Add type hints to preview URL resource.
clokep committed Jan 13, 2021
1 parent 9895e46 commit 192b3f8
Showing 2 changed files with 49 additions and 30 deletions.
2 changes: 1 addition & 1 deletion synapse/rest/media/v1/media_repository.py
@@ -76,7 +76,7 @@ def __init__(self, hs: "HomeServer"):
         self.max_upload_size = hs.config.max_upload_size
         self.max_image_pixels = hs.config.max_image_pixels
 
-        self.primary_base_path = hs.config.media_store_path
+        self.primary_base_path = hs.config.media_store_path  # type: str
         self.filepaths = MediaFilePaths(self.primary_base_path)  # type: MediaFilePaths
 
         self.dynamic_thumbnails = hs.config.dynamic_thumbnails
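The "# type: str" comment is the PEP 484 type-comment form of annotation: mypy reads the comment, while the interpreter ignores it. A minimal sketch of this style next to the equivalent PEP 526 variable annotation, using an illustrative class rather than Synapse's real one:

class PathConfigSketch:
    """Illustrative only; not Synapse's actual media repository."""

    def __init__(self, media_store_path: str) -> None:
        # PEP 484 type comment, as in the diff above.
        self.primary_base_path = media_store_path  # type: str
        # Equivalent PEP 526 variable annotation (Python 3.6+).
        self.backup_base_path: str = media_store_path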
77 changes: 48 additions & 29 deletions synapse/rest/media/v1/preview_url_resource.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 # Copyright 2016 OpenMarket Ltd
+# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import datetime
 import errno
 import fnmatch
@@ -23,12 +23,13 @@
 import shutil
 import sys
 import traceback
-from typing import Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, Optional, Union
 from urllib import parse as urlparse
 
 import attr
 
 from twisted.internet.error import DNSLookupError
+from twisted.web.http import Request
 
 from synapse.api.errors import Codes, SynapseError
 from synapse.http.client import SimpleHttpClient
Expand All @@ -41,13 +42,20 @@
from synapse.logging.context import make_deferred_yieldable, run_in_background
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.rest.media.v1._base import get_filename_from_headers
from synapse.rest.media.v1.media_storage import MediaStorage
from synapse.util import json_encoder
from synapse.util.async_helpers import ObservableDeferred
from synapse.util.caches.expiringcache import ExpiringCache
from synapse.util.stringutils import random_string

from ._base import FileInfo

if TYPE_CHECKING:
from lxml import etree

from synapse.app.homeserver import HomeServer
from synapse.rest.media.v1.media_repository import MediaRepository

logger = logging.getLogger(__name__)

_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
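typing.TYPE_CHECKING is False at runtime, so the guarded imports above are evaluated only by static type checkers; this sidesteps the circular import with HomeServer and avoids making lxml a hard import-time dependency. Names imported this way must appear in annotations as string literals (forward references). A minimal sketch of the pattern, with a hypothetical module name:

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by mypy and friends, never executed at runtime, so a
    # circular or optional dependency costs nothing here.
    from myapp.server import Server  # hypothetical module


class ResourceSketch:
    def __init__(self, server: "Server") -> None:
        # The quoted annotation is a forward reference: the name need
        # not exist when this module is imported.
        self._server = server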
@@ -119,7 +127,12 @@ class OEmbedError(Exception):
 class PreviewUrlResource(DirectServeJsonResource):
     isLeaf = True
 
-    def __init__(self, hs, media_repo, media_storage):
+    def __init__(
+        self,
+        hs: "HomeServer",
+        media_repo: "MediaRepository",
+        media_storage: MediaStorage,
+    ):
         super().__init__()
 
         self.auth = hs.get_auth()
@@ -166,11 +179,11 @@ def __init__(self, hs, media_repo, media_storage):
             self._start_expire_url_cache_data, 10 * 1000
         )
 
-    async def _async_render_OPTIONS(self, request):
+    async def _async_render_OPTIONS(self, request: Request) -> None:
         request.setHeader(b"Allow", b"OPTIONS, GET")
         respond_with_json(request, 200, {}, send_cors=True)
 
-    async def _async_render_GET(self, request):
+    async def _async_render_GET(self, request: Request) -> None:
 
         # XXX: if get_user_by_req fails, what should we do in an async render?
         requester = await self.auth.get_user_by_req(request)
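Annotating the handlers with twisted.web.http.Request lets the checker verify calls like setHeader, and the None return type records that these methods respond via side effects on the request rather than a return value. A standalone sketch of the same shape (the class is a stand-in, not Synapse's DirectServeJsonResource):

from twisted.web.http import Request


class OptionsHandlerSketch:
    """Stand-in for a DirectServeJsonResource subclass."""

    async def _async_render_OPTIONS(self, request: Request) -> None:
        # With the parameter typed as Request, mypy can check this call.
        request.setHeader(b"Allow", b"OPTIONS, GET")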
@@ -450,7 +463,7 @@ async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
             logger.warning("Error downloading oEmbed metadata from %s: %r", url, e)
             raise OEmbedError() from e
 
-    async def _download_url(self, url: str, user):
+    async def _download_url(self, url: str, user: str) -> Dict[str, Any]:
         # TODO: we should probably honour robots.txt... except in practice
         # we're most likely being explicitly triggered by a human rather than a
         # bot, so are we really a robot?
@@ -580,7 +593,7 @@ def _start_expire_url_cache_data(self):
             "expire_url_cache_data", self._expire_url_cache_data
         )
 
-    async def _expire_url_cache_data(self):
+    async def _expire_url_cache_data(self) -> None:
         """Clean up expired url cache content, media and thumbnails.
         """
         # TODO: Delete from backup media store
@@ -676,7 +689,9 @@ async def _expire_url_cache_data(self):
             logger.debug("No media removed from url cache")
 
 
-def decode_and_calc_og(body, media_uri, request_encoding=None) -> Dict[str, str]:
+def decode_and_calc_og(
+    body: bytes, media_uri: str, request_encoding: Optional[str] = None
+) -> Dict[str, Optional[str]]:
     # If there's no body, nothing useful is going to be found.
     if not body:
         return {}
@@ -697,7 +712,7 @@ def decode_and_calc_og(body, media_uri, request_encoding=None) -> Dict[str, str]
     return og
 
 
-def _calc_og(tree, media_uri):
+def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
     # suck our tree into lxml and define our OG response.
 
     # if we see any image URLs in the OG response, then spider them
@@ -801,15 +816,19 @@ def _calc_og(tree, media_uri):
                 for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
             )
             og["og:description"] = summarize_paragraphs(text_nodes)
-    else:
+    elif og["og:description"]:
+        # This must be a non-empty string at this point.
+        assert isinstance(og["og:description"], str)
         og["og:description"] = summarize_paragraphs([og["og:description"]])
 
     # TODO: delete the url downloads to stop diskfilling,
     # as we only ever cared about its OG
     return og
 
 
-def _iterate_over_text(tree, *tags_to_ignore):
+def _iterate_over_text(
+    tree, *tags_to_ignore: Iterable[Union[str, "etree.Comment"]]
+) -> Generator[str, None, None]:
     """Iterate over the tree returning text nodes in a depth first fashion,
     skipping text nodes inside certain tags.
     """
@@ -843,32 +862,32 @@ def _iterate_over_text(tree, *tags_to_ignore):
         )
 
 
-def _rebase_url(url, base):
-    base = list(urlparse.urlparse(base))
-    url = list(urlparse.urlparse(url))
-    if not url[0]:  # fix up schema
-        url[0] = base[0] or "http"
-    if not url[1]:  # fix up hostname
-        url[1] = base[1]
-    if not url[2].startswith("/"):
-        url[2] = re.sub(r"/[^/]+$", "/", base[2]) + url[2]
-    return urlparse.urlunparse(url)
+def _rebase_url(url: str, base: str) -> str:
+    base_parts = list(urlparse.urlparse(base))
+    url_parts = list(urlparse.urlparse(url))
+    if not url_parts[0]:  # fix up schema
+        url_parts[0] = base_parts[0] or "http"
+    if not url_parts[1]:  # fix up hostname
+        url_parts[1] = base_parts[1]
+    if not url_parts[2].startswith("/"):
+        url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts[2]) + url_parts[2]
+    return urlparse.urlunparse(url_parts)
 
 
-def _is_media(content_type):
-    if content_type.lower().startswith("image/"):
-        return True
+def _is_media(content_type: str) -> bool:
+    return content_type.lower().startswith("image/")
 
 
-def _is_html(content_type):
+def _is_html(content_type: str) -> bool:
     content_type = content_type.lower()
-    if content_type.startswith("text/html") or content_type.startswith(
+    return content_type.startswith("text/html") or content_type.startswith(
         "application/xhtml"
-    ):
-        return True
+    )
 
 
-def summarize_paragraphs(text_nodes, min_size=200, max_size=500):
+def summarize_paragraphs(
+    text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
+) -> Optional[str]:
     # Try to get a summary of between 200 and 500 words, respecting
     # first paragraph and then word boundaries.
     # TODO: Respect sentences?
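Beyond the annotations, renaming the locals to url_parts/base_parts matters for _rebase_url: the untyped version rebound its parameters to lists, which mypy would reject once url and base are declared as str. The function resolves a possibly relative URL against a base by filling in a missing scheme, host, or path prefix. A standalone copy of the same logic, with a usage example:

import re
from urllib import parse as urlparse


def rebase_url(url: str, base: str) -> str:
    base_parts = list(urlparse.urlparse(base))
    url_parts = list(urlparse.urlparse(url))
    if not url_parts[0]:  # fill in a missing scheme
        url_parts[0] = base_parts[0] or "http"
    if not url_parts[1]:  # fill in a missing hostname
        url_parts[1] = base_parts[1]
    if not url_parts[2].startswith("/"):  # resolve a relative path
        url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts[2]) + url_parts[2]
    return urlparse.urlunparse(url_parts)


assert rebase_url("image.png", "https://example.com/a/page.html") == (
    "https://example.com/a/image.png"
)
assert rebase_url("//cdn.example.net/x.png", "https://example.com/page") == (
    "https://cdn.example.net/x.png"
)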