"""
Subtitles information
"""
import functools
import re
from ... import errors, utils
import logging # isort:skip
_log = logging.getLogger(__name__)
langcodes = utils.LazyModule(module='langcodes', namespace=globals())
# Map various strings to canonical sutitle format names
_subtitle_formats = {
# Matroska Codec ID
# Reference: https://www.matroska.org/technical/codec_specs.html
'S_HDMV/PGS': 'PGS', # Blu-ray
'S_VOBSUB': 'VobSub', # DVD
'S_TEXT/UTF8': 'SRT', # SubRip
'S_TEXT/SSA': 'SSA', # SubStation Alpha
'S_TEXT/ASS': 'ASS', # Advanced SubStation Alpha
'S_TEXT/WEBVTT': 'VTT', # Web Video Text Tracks
# File extension
'srt': 'SRT', # SubRip
'ssa': 'SSA', # SubStation Alpha
'ass': 'ASS', # Advanced SubStation Alpha
'vtt': 'VTT', # Web Video Text Tracks
'idx': 'VobSub', # DVD
'sub': 'VobSub', # DVD
}
[docs]
class Subtitle:
"""
Dataclass that stores information about a subtitle track
:param language: BCP47 language code with optional country code (e.g. "en",
"en-UK", "es-419", "zh-HANS")
References:
https://www.matroska.org/technical/notes.html [Language Codes]
https://en.wikipedia.org/wiki/IETF_language_tag
If `language` is not a valid language code, the `language` and
`region` attributes are empty strings.
:param bool forced: Whether the subtitle track should always be played
:param bool format: Subtitle format (e.g. "SRT", "VobSub", "PGS")
If `language` is not a valid BCP47 code, it is set to "?".
"""
def __init__(self, *, language, forced=False, format=''):
try:
lc = langcodes.Language.get(language)
if not lc.is_valid():
raise ValueError
except ValueError:
self._language = '?'
self._region = ''
else:
self._language = lc.language
self._region = lc.territory or ''
self._forced = bool(forced)
self._format = str(format or '')
_language_regex = re.compile(r'^([a-zA-Z\?-]+).*$')
_format_regex = re.compile(r'^.*\[\s*(\w+?)\s*\].*?$')
[docs]
@classmethod
def from_string(cls, string):
"""Extract language code and `forced` flag from `string`"""
return cls(
language=(
cls._language_regex.sub(r'\1', string)
if cls._language_regex.search(string) else
'?'
),
forced=bool(re.search(r'forced', string)),
format=(
cls._format_regex.sub(r'\1', string)
if cls._format_regex.search(string) else
''
),
)
@property
def language(self):
"""
Language part of the `language` initialization argument, e.g. "en",
"es", "zh", etc.
"""
return self._language
@property
def region(self):
"""
Region part of the `language` initialization argument, e.g. "UK",
"419", "HANS", etc.
"""
return self._region
@property
def forced(self):
"""Whether the subtitle track should always be played"""
return self._forced
@property
def format(self):
"""Subtitle format (e.g. "SRT", "VobSub", "PGS")"""
return self._format
def __str__(self):
text = self.language
if self.region:
text += f'-{self.region}'
if self.forced:
text += ' (forced)'
if self.format:
text += f' [{self.format}]'
return text
def __repr__(self):
if self.region:
language = f'{self.language}-{self.region}'
else:
language = self.language
return f'{type(self).__name__}(language={language!r}, forced={self.forced!r}, format={self.format!r})'
def __hash__(self):
return hash((self.language, self.region, self.forced, self.format))
def __eq__(self, other):
return hash(self) == hash(other)
[docs]
def get_subtitles(content_path):
"""
Return sequence of :class:`Subtitle` objects from all subtitle sources
This function combines all ``get_subtitles_*`` functions.
If `content_path` is a multi-disc release (i.e. a directory that contains multiple directories
that each contain a VIDEO_TS or BDMV subdirectory), it returns the combined subtitles of all
discs.
:param content_path: Path to release file or directory
"""
subtitles = []
if utils.disc.is_bluray(content_path, multidisc=True):
# BDMV
for disc_path in utils.disc.bluray.get_disc_paths(content_path):
subtitles.extend(get_subtitles_from_bluray(disc_path))
elif utils.disc.is_dvd(content_path, multidisc=True):
# VIDEO_TS
for disc_path in utils.disc.dvd.get_disc_paths(content_path):
subtitles.extend(get_subtitles_from_dvd(disc_path))
else:
# Muxed subtitles in video file(s).
subtitles.extend(get_subtitles_from_mediainfo(content_path))
# Separate srt/ass/ssa files in `content_path` directory.
subtitles.extend(get_subtitles_from_text_files(content_path))
# Separate idx/sub files in `content_path` directory.
subtitles.extend(get_subtitles_from_idx_files(content_path))
return tuple(subtitles)
_srt_filename_regex = re.compile(r'(?:\.([a-z]{2,3}|[a-z]{2}-[A-Z]{2})|)\.(\w+)$')
[docs]
@functools.cache
def get_subtitles_from_text_files(content_path):
"""
Return sequence of :class:`Subtitle` objects from subtitle file names
For a video file named "foo.mkv", the subtitle language can be in the file
name, e.g. "foo.en.srt" or "foo.eng.ass".
If `content_path` is a directory, it is searched recursively for subtitle
files.
If `content_path` is not a directory, the returned sequence is empty.
:param content_path: Path to release file or directory
"""
subtitles = []
for filepath in utils.fs.file_list(content_path, extensions=('srt', 'ssa', 'ass', 'vtt')):
filename = utils.fs.basename(filepath)
match = _srt_filename_regex.search(filename)
if match:
language = match.group(1) or ''
extension = match.group(2).lower()
format = _subtitle_formats.get(extension, '')
else:
language = ''
format = ''
subtitle = Subtitle(
language=language,
forced=False,
format=format,
)
_log.debug('Found text subtitle: %s: %r', filepath, subtitle)
subtitles.append(subtitle)
_log.debug('Subtitles from text files: %r', subtitles)
return subtitles
[docs]
@functools.cache
def get_subtitles_from_idx_files(content_path):
"""
Return sequence of :class:`Subtitle` objects from ``*.idx`` files
For .idx/.sub pairs, the .idx file can contain the language.
If `content_path` is a directory, it is searched recursively for ``*.idx``
files.
If `content_path` is not a directory, the returned sequence is empty.
:param content_path: Path to release file or directory
"""
subtitles = []
for idx_filepath in utils.fs.file_list(content_path, extensions=('idx',)):
try:
language = _get_language_code_from_idx_file(idx_filepath)
except OSError as e:
_log.debug('Ignoring exception from reading %r: %r', idx_filepath, e)
else:
subtitle = Subtitle(
language=language,
forced=False,
format=_subtitle_formats['idx'],
)
_log.debug('Found IDX subtitle: %s: %r', idx_filepath, subtitle)
subtitles.append(subtitle)
_log.debug('Subtitles from idx files: %r', subtitles)
return subtitles
_idx_language_regex = re.compile(r'^id:\s*([a-z]{2,3})\b')
def _get_language_code_from_idx_file(idx_filepath):
# Example: "id: de, index: 0"
# Expect 2-letter and 3-letter codes.
with open(idx_filepath, 'r') as f:
for line in f:
match = _idx_language_regex.search(line)
if match:
language = match.group(1)
_log.debug('Found subtitle language in %s: %r', idx_filepath, language)
return language
return ''
[docs]
@functools.cache
def get_subtitles_from_dvd(content_path):
"""
Return sequence of :class:`Subtitle` objects from ``VIDEO_TS`` subdirectory
If `content_path` is not a directory or does not contain a ``VIDEO_TS`` subdirectory, the
returned sequence is empty.
:param content_path: Path directory that contains ``VIDEO_TS`` subdirectory
"""
subtitles = []
main_playlists = tuple(
playlist
for playlist in utils.disc.dvd.get_playlists(content_path)
if playlist.is_main
)
for playlist in main_playlists:
for sub in get_subtitles_from_mediainfo(playlist.filepath):
sub._format = _subtitle_formats['S_VOBSUB']
subtitles.append(sub)
_log.debug('Subtitles from DVD tree: %r', subtitles)
return subtitles
[docs]
@functools.cache
def get_subtitles_from_bluray(content_path):
"""
Return sequence of :class:`Subtitle` objects from ``BDMV`` subdirectory
If `content_path` is not a directory or does not contain a ``BDMV`` subdirectory, the
returned sequence is empty.
:param content_path: Path directory that contains ``BDMV`` subdirectory
"""
subtitles = []
main_playlists = tuple(
playlist
for playlist in utils.disc.bluray.get_playlists(content_path)
if playlist.is_main
)
for playlist in main_playlists:
for sub in get_subtitles_from_mediainfo(playlist.filepath):
sub._format = _subtitle_formats['S_HDMV/PGS']
subtitles.append(sub)
_log.debug('Subtitles from Blu-ray tree: %r', subtitles)
return subtitles