"""
API for themoviedb.org
"""
import functools
import re
import urllib.parse
from .. import html, http
from ..types import ReleaseType
from . import common
from .base import WebDbApiBase
import logging # isort:skip
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
[docs]
class TmdbApi(WebDbApiBase):
"""API for themoviedb.org"""
# Short identifier of this database.
name = 'tmdb'
# Human-readable name; it is interpolated into `no_results_info` below.
label = 'TMDb'
# Hint about search results.
# NOTE(review): presumably displayed when a search finds nothing - confirm
# against WebDbApiBase.
no_results_info = f"{label}'s year is often slightly different."
# This class defines no configuration options of its own.
default_config = {}
# Root URL to which page paths and IDs are appended.
_url_base = 'http://themoviedb.org'
# Parsed pages, keyed by (path, sorted query parameters). This is a class
# attribute, so the cache is shared by all instances.
_soup_cache = {}
async def _get_soup(self, path, params=None):
    """
    Return the parsed HTML page at `path` below :attr:`_url_base`

    Parsed pages are stored in :attr:`_soup_cache` under a key built from
    `path` and the sorted `params` items. A page that is already stored is
    returned without making another request.

    :param str path: URL path of the page; leading slashes are ignored when
        the URL is built
    :param params: Mapping of query parameters or `None` for no parameters
    """
    # `None` replaces the former mutable `{}` default so that no dictionary
    # object is shared between calls.
    if params is None:
        params = {}

    cache_id = (path, tuple(sorted(params.items())))
    if cache_id in self._soup_cache:
        return self._soup_cache[cache_id]

    text = await http.get(
        url=f'{self._url_base}/{path.lstrip("/")}',
        params=params,
        user_agent='BROWSER',
        cache=True,
    )
    soup = html.parse(text)
    self._soup_cache[cache_id] = soup
    return soup
[docs]
def get_id_from_text(self, text):
    """
    Return the first TMDb ID found in `text` or `None` if there is none

    An ID is "movie/<digits>" or "tv/<digits>". The media type is matched
    case-insensitively and returned exactly as it is spelled in `text`.
    """
    # Examples:
    # https://www.themoviedb.org/movie/334536-the-blackcoat-s-daughter
    # https://www.themoviedb.org/tv/45016-bron-broen
    found = re.search(r'\b((?i:movie|tv)/\d+)\b', text)
    return found.group(1) if found else None
[docs]
async def search(self, query):
    """
    Search TMDb and return a sequence of :class:`_TmdbSearchResult` objects

    If `query.id` is set, the title pages for that ID are read directly. An
    ID that contains neither "movie" nor "tv" is looked up as both
    "movie/<id>" and "tv/<id>". The results are returned as a list.

    Otherwise, `query.title_normalized` (followed by " y:<year>" if
    `query.year` is set) is sent to the movie or TV search page, depending
    on `query.type`. For any other type, a movie search and a series search
    are made and their results are concatenated. If `query.year` is set,
    results with a different year are dropped.

    An empty list is returned if `query` has neither ID nor title.
    """
    _log.debug('Searching TMDb for %s', query)

    if query.id:
        async def generate_result(id):
            _log.debug('Getting ID: %r', id)
            english_title = await self.title_english(id)
            original_title = await self.title_original(id)
            release_type = await self.type(id)
            page_url = await self.url(id)
            release_year = await self.year(id)
            return _TmdbSearchResult(
                tmdb_api=self,
                id=id,
                title=english_title or original_title,
                type=release_type,
                url=page_url,
                year=release_year,
                # Everything below is looked up only when it is called.
                cast=functools.partial(self.cast, id),
                directors=functools.partial(self.directors, id),
                genres=functools.partial(self.genres, id),
                summary=functools.partial(self.summary, id),
                title_english=functools.partial(self.title_english, id),
                title_original=functools.partial(self.title_original, id),
            )

        if re.search(r'\b(?:movie|tv)\b', query.id):
            return [await generate_result(query.id)]
        # The media type is unknown, so both are tried.
        return [
            await generate_result(f'movie/{query.id}'),
            await generate_result(f'tv/{query.id}'),
        ]

    if not query.title:
        return []

    params = {'query': query.title_normalized}
    if query.year is not None:
        params['query'] += f' y:{query.year}'

    if query.type is ReleaseType.movie:
        search_path, data_media_type = '/search/movie', 'movie'
    elif query.type in (ReleaseType.season, ReleaseType.episode):
        search_path, data_media_type = '/search/tv', 'tv'
    else:
        movie_results = await self.search(query.copy(type=ReleaseType.movie))
        series_results = await self.search(query.copy(type=ReleaseType.series))
        return movie_results + series_results
    soup = await self._get_soup(search_path, params=params)

    # Results with the wrong type are included in the results but hidden.
    for hidden_tag in soup.find_all('div', class_='hide'):
        hidden_tag.clear()

    selector = f'.search_results .media-card-list:has([data-media-type="{data_media_type}"]) > div'
    results = tuple(
        _TmdbSearchResult(soup=item, tmdb_api=self)
        for item in soup.css.iselect(selector)
    )
    if query.year is None:
        return results

    # Filter the search results for the queried year because TMDb is
    # very smart and returns wrong search results.
    return tuple(
        result for result in results
        if result.year == query.year
    )
# Matches the URL path of a person page: "/person/<digits>", optionally
# followed by "-" and lowercase letters/hyphens. Group 1 is that path.
_person_url_path_regex = re.compile(r'(/person/\d+(?:-[-a-z]+|))')
def _get_persons(self, tag, role_tag=None):
    """
    Return a tuple of :class:`common.Person` objects for the person links
    in `tag`

    Every `<a>` tag below `tag` whose `href` contains a person page path and
    which has a string becomes one person. Links without a string are
    skipped.

    :param tag: HTML tag that is searched for person links
    :param role_tag: Tag whose string is used as the role of every person
        found; the role is an empty string if this is `None` or has no
        string
    """
    # The role does not depend on the individual link.
    role = role_tag.string.strip() if role_tag and role_tag.string else ''
    base_url = self._url_base.rstrip("/")

    persons = []
    for a_tag in tag.find_all('a', href=self._person_url_path_regex):
        if not a_tag.string:
            continue
        name = a_tag.string.strip()

        # find_all() applies the regex with search() semantics. Using
        # search() here as well means a link that passed the filter also
        # yields its person path, even if the href does not start with it.
        url_match = self._person_url_path_regex.search(a_tag['href'])
        if url_match:
            url = f'{base_url}/{url_match.group(1).lstrip("/")}'
        else:
            url = ''

        persons.append(common.Person(name, url=url, role=role))
    return tuple(persons)
[docs]
async def cast(self, id):
    """
    Return the persons found in the ".people > .card" elements of the page
    for `id` as a tuple

    The role of each person is taken from the card's `<p class="character">`
    tag. An empty tuple is returned if `id` is falsy.
    """
    if not id:
        return ()

    soup = await self._get_soup(id)
    members = []
    for card in soup.select('.people > .card'):
        character_tag = card.find('p', {'class': 'character'})
        members += self._get_persons(card, role_tag=character_tag)
    return tuple(members)
async def _countries(self, id):
    """Return an empty tuple; nothing is looked up for `id`"""
    return tuple()
[docs]
async def languages(self, id):
    """
    Return a 1-tuple with the original language from the page for `id`

    An empty tuple is returned if `id` is falsy, if the page has no
    "Original Language" label or if no text is left next to the label.
    """
    import copy

    if not id:
        return ()

    soup = await self._get_soup(id)
    try:
        original_language_tag = soup.find('bdi', string='Original Language').parent.parent
    except AttributeError:
        # No label on the page.
        return ()

    # The soup comes from a cache that is shared between calls. Removing
    # the label from the cached tree would make every later call for the
    # same ID miss the label, so it is removed from a detached copy.
    language_tag = copy.copy(original_language_tag)
    language_tag.strong.extract()
    if language := html.as_text(language_tag):
        return (language,)
    return ()
[docs]
async def creators(self, id):
    """
    Return the persons from ".people > .profile" elements that contain a
    `<p>` tag matching "Creator" (case-insensitive) as a tuple

    An empty tuple is returned if `id` is falsy.
    """
    if not id:
        return ()

    soup = await self._get_soup(id)
    job_regex = re.compile(r'(?i:Creator)')
    return tuple(
        person
        for profile in soup.select('.people > .profile')
        if profile.find('p', string=job_regex)
        for person in self._get_persons(profile)
    )
[docs]
async def directors(self, id):
    """
    Return the persons from ".people > .profile" elements that contain a
    `<p>` tag matching "Director" (case-insensitive) as a tuple

    An empty tuple is returned if `id` is falsy.
    """
    if not id:
        return ()

    soup = await self._get_soup(id)
    job_regex = re.compile(r'(?i:Director)')
    return tuple(
        person
        for profile in soup.select('.people > .profile')
        if profile.find('p', string=job_regex)
        for person in self._get_persons(profile)
    )
[docs]
async def genres(self, id):
    """
    Return the lower-cased genre names from the page for `id` as a tuple

    "short" is appended if the default runtime is 40 minutes or less. An
    empty tuple is returned if `id` is falsy or the page has no element
    with the class "genres".
    """
    if not id:
        return ()

    soup = await self._get_soup(id)
    genres_tag = soup.find(class_='genres')
    if not genres_tag:
        return ()

    names = [
        html.as_text(link).lower()
        for link in genres_tag.find_all('a')
    ]

    # "short" is not a genre on TMDb and keywords are wonky. But
    # Wikipedia says:
    # > The Academy of Motion Picture Arts and Sciences defines a short
    # > film as "an original motion picture that has a running time of
    # > 40 minutes or less, including all credits".
    # NOTE(review): the runtime check is assumed to apply only when a
    # genres element was found - confirm.
    runtimes = await self.runtimes(id)
    if runtimes and runtimes['default'] <= 40:
        names.append('short')
    return tuple(names)
[docs]
async def poster_url(self, id, season=None):
    """
    Return the absolute URL of the poster image on the page for `id`

    The URL is the first whitespace-separated item of the `src` attribute
    of the `<img class="poster">` tag, joined with :attr:`_url_base`.

    An empty string is returned if `id` is falsy, if there is no poster
    image or if its `src` attribute is missing, empty or only whitespace.

    :param id: TMDb ID, e.g. "movie/123"
    :param season: Not used by this implementation
    """
    if not id:
        return ''

    soup = await self._get_soup(id)
    img_tag = soup.find('img', class_='poster')
    if not img_tag:
        return ''

    # A whitespace-only `src` splits into nothing. It is handled like a
    # missing attribute instead of failing on the first-item lookup.
    sources = (img_tag.get('src') or '').split()
    if not sources:
        return ''
    return urllib.parse.urljoin(self._url_base, sources[0])
# Bounds of the rating scale. rating() reads its value from a "data-percent"
# attribute.
rating_min = 0.0
rating_max = 100.0
[docs]
async def rating(self, id):
    """
    Return the user score from the page for `id` as a float or `None`

    The score is the `data-percent` attribute of the element with the class
    "user_score_chart". `None` is returned if `id` is falsy, if there is no
    such element, or if the attribute is missing or not a number.
    """
    if not id:
        return None

    soup = await self._get_soup(id)
    rating_tag = soup.find(class_='user_score_chart')
    if not rating_tag:
        return None

    try:
        return float(rating_tag['data-percent'])
    except (KeyError, ValueError, TypeError):
        # KeyError: the element exists but has no "data-percent" attribute.
        return None
async def _runtimes(self, id):
    """
    Return `{'default': <minutes>}` for the page for `id` or an empty dict

    The runtime is read from the string of the `<span class="runtime">` tag.
    Every "<number>h" counts as 60 minutes per unit and every "<number>m" as
    one minute per unit. An empty dict is returned if `id` is falsy or no
    runtime greater than zero is found.
    """
    if not id:
        return {}

    soup = await self._get_soup(id)
    runtimes_tag = soup.find('span', class_='runtime')
    try:
        text = str(runtimes_tag.string)
    except AttributeError:
        # No runtime tag on the page.
        text = ''

    minutes = sum(
        int(found.group(1)) * unit_minutes
        for unit, unit_minutes in (('h', 60), ('m', 1))
        for found in re.finditer(rf'(\d+)\s*{unit}', text)
    )
    return {'default': minutes} if minutes > 0 else {}
# Text fragments that mark a placeholder overview. summary() returns an empty
# string if the overview on the page contains any of them.
_no_overview_texts = (
"We don't have an overview",
'No overview found.',
)
[docs]
async def summary(self, id):
    """
    Return the overview text from the page for `id`

    The text is the concatenation of the stripped strings in the
    `<div class="overview">` tag.

    An empty string is returned if `id` is falsy, if the page has no
    overview tag or if the overview contains one of the placeholder texts
    in :attr:`_no_overview_texts`.
    """
    if not id:
        return ''

    soup = await self._get_soup(id)
    overview_tag = soup.find('div', class_='overview')
    if not overview_tag:
        # A page without an overview tag is handled like an empty overview
        # instead of failing on the attribute lookup.
        return ''

    overview = ''.join(overview_tag.stripped_strings)
    if any(text in overview for text in self._no_overview_texts):
        return ''
    return overview
async def _title_original(self, id):
    """
    Return the original title from the page for `id`

    The title is the second stripped string in the grandparent of the
    "Original Title" or "Original Name" label. If it cannot be found, the
    first item returned by :meth:`_titles_english` is returned instead.
    """
    soup = await self._get_soup(id)
    try:
        # Find non-English title
        label = soup.find(string=re.compile(r'Original (?:Title|Name)'))
        return tuple(label.parent.parent.stripped_strings)[1]
    except (AttributeError, TypeError, ValueError, IndexError):
        # Default to English title
        return (await self._titles_english(id))[0]
async def _titles_english(self, id):
    """
    Return a 1-tuple with the first stripped string of the element with the
    class "title" on the page for `id`
    """
    soup = await self._get_soup(id)
    parts = [*soup.find(class_='title').stripped_strings]
    first_part = parts[0]
    return (first_part,)
[docs]
async def type(self, id):
    """
    Return the :class:`ReleaseType` of the page for `id`

    The type is `ReleaseType.series` if the page has a `<bdi>` tag whose
    string is "Network" or "Networks", and `ReleaseType.movie` if it does
    not. `ReleaseType.unknown` is returned if `id` is falsy.
    """
    if not id:
        return ReleaseType.unknown

    soup = await self._get_soup(id)
    has_network = soup.find('bdi', string=re.compile(r'^Networks?$'))
    return ReleaseType.series if has_network else ReleaseType.movie
[docs]
async def url(self, id):
    """
    Return the page URL for `id` or an empty string if `id` is falsy

    Slashes at both ends of `id` and at the end of :attr:`_url_base` are
    removed before they are joined with a single slash.
    """
    return f'{self._url_base.rstrip("/")}/{id.strip("/")}' if id else ''
[docs]
async def year(self, id):
    """
    Return the release year from the page for `id` as a 4-digit string

    The year is read from the element with the class "release_date" after
    surrounding parentheses are removed. An empty string is returned if
    `id` is falsy, if there is no such element or if its text is not
    exactly four digits.
    """
    if not id:
        return ''

    soup = await self._get_soup(id)
    release_date_tag = soup.find(class_='release_date')
    if not release_date_tag:
        return ''

    text = ''.join(release_date_tag.stripped_strings).strip('()')
    is_year = len(text) == 4 and text.isdigit()
    return text if is_year else ''
class _TmdbSearchResult(common.SearchResult):
"""
Search result for themoviedb.org

Values that are not passed as keyword arguments are read from an HTML
fragment of the search results page or replaced by callables that look
them up via a :class:`TmdbApi` instance.
"""
def __init__(self, *, tmdb_api, soup=None, cast=None, countries=None,
             directors=None, id=None, genres=None, poster=None, summary=None, title=None,
             title_english=None, title_original=None, type=None, url=None,
             year=None):
    """
    Fill in every value that was not provided and pass them on to the
    parent class

    :param tmdb_api: Object whose methods are bound to the ID to look up
        values that were not provided
    :param soup: Parsed HTML of one search result; ID, title, type, URL and
        year are read from it if they are not provided. An empty document
        is used if this is falsy.

    Every other argument overrides the corresponding looked-up or scraped
    value if it is truthy.
    """
    soup = soup or html.parse('')
    id = id or self._get_id(soup)
    super().__init__(
        cast=cast or functools.partial(tmdb_api.cast, id),
        countries=countries or (),
        directors=directors or functools.partial(tmdb_api.directors, id),
        id=id,
        genres=genres or functools.partial(tmdb_api.genres, id),
        # An explicitly passed `poster` is honored like every other
        # argument instead of being dropped.
        # NOTE(review): `poster` is presumably provided by the base class
        # of `tmdb_api` - confirm.
        poster=poster or functools.partial(tmdb_api.poster, id),
        summary=summary or functools.partial(tmdb_api.summary, id),
        title=title or self._get_title(soup),
        title_english=title_english or functools.partial(tmdb_api.title_english, id),
        title_original=title_original or functools.partial(tmdb_api.title_original, id),
        type=type or self._get_type(soup),
        url=url or self._get_url(soup),
        year=year or self._get_year(soup),
    )
# Matches a string that contains "/movie/<digits>" or "/tv/<digits>"; group 1
# is the "movie/<digits>" or "tv/<digits>" part without the leading slash.
_id_regex = re.compile(r'^.*/((?:movie|tv)/[0-9]+).*?$')
def _get_id(self, soup):
    """
    Return the ID ("movie/<digits>" or "tv/<digits>") from the first link
    in `soup` that points to a title page, or an empty string if there is
    no such link
    """
    for a_tag in soup.find_all('a'):
        href = a_tag.get('href')
        if not href:
            # Anchors without a target cannot be searched by the regex.
            continue
        found = self._id_regex.search(href)
        if found:
            return found.group(1)
    return ''
def _get_url(self, soup):
    """
    Return the URL of the title page that `soup` links to, or an empty
    string if :meth:`_get_id` finds no ID in `soup`
    """
    id = self._get_id(soup)
    return f'{TmdbApi._url_base}/{id}' if id else ''
def _get_type(self, soup):
    """
    Return the :class:`ReleaseType` indicated by the first link in `soup`
    whose `data-media-type` attribute is "movie" or "tv"

    `ReleaseType.unknown` is returned if there is no such link.
    """
    media_types = (a_tag.get('data-media-type') for a_tag in soup.find_all('a'))
    for media_type in media_types:
        if media_type == 'movie':
            return ReleaseType.movie
        if media_type == 'tv':
            return ReleaseType.series
    return ReleaseType.unknown
def _get_title(self, soup):
    """
    Return the text of the first child of the first `<h2>` tag in `soup`
    or an empty string if `soup` contains no `<h2>` tag
    """
    headers = soup.select('h2')
    if not headers:
        return ''

    # Title tag may contain other information in smaller font or dimmed.
    first_child = headers[0].contents[0]
    return html.as_text(first_child)
def _get_year(self, soup):
    """
    Return the four digits at the end of the release date in `soup`

    The release date is the string of the element with the class
    "release_date". An empty string is returned if there is no such
    element, if it has no single string or if that string does not end
    with four digits.
    """
    release_date = soup.find(class_='release_date')
    if not release_date:
        return ''

    # `.string` is None for an element without exactly one string, and
    # None cannot be searched by a regular expression.
    text = release_date.string
    if not text:
        return ''

    found = re.search(r'(\d{4})$', text)
    return found.group(1) if found else ''