Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Support oEmbed for media previews. #7920

Merged
merged 18 commits into from
Jul 27, 2020
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/7920.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Support oEmbed for media previews.
265 changes: 220 additions & 45 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from typing import Dict, Optional
from urllib import parse as urlparse

import attr
from canonicaljson import json

from twisted.internet import defer
Expand Down Expand Up @@ -56,6 +57,65 @@
OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000

ONE_HOUR = 60 * 60 * 1000

# A map of globs to API endpoints.
_oembed_globs = {
# Twitter.
"https://publish.twitter.com/oembed": [
"https://twitter.com/*/status/*",
"https://*.twitter.com/*/status/*",
"https://twitter.com/*/moments/*",
"https://*.twitter.com/*/moments/*",
# Include the HTTP versions too.
"http://twitter.com/*/status/*",
"http://*.twitter.com/*/status/*",
"http://twitter.com/*/moments/*",
"http://*.twitter.com/*/moments/*",
],
}
# Convert the globs to regular expressions.
_oembed_patterns = {}
for endpoint, globs in _oembed_globs.items():
for glob in globs:
# Convert the glob into a sane regular expression to match against. The
# rules followed will be slightly different for the domain portion vs.
# the rest.
#
# 1. The scheme must be one of HTTP / HTTPS (and have no globs).
# 2. The domain can have globs, but we limit it to characters that can
# reasonably be a domain part.
# TODO: This does not attempt to handle Unicode domain names.
# 3. Other parts allow a glob to be any one, or more, characters.
results = urlparse.urlparse(glob)

# Ensure the scheme does not have wildcards (and is a sane scheme).
if results.scheme not in {"http", "https"}:
raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,))

pattern = urlparse.urlunparse(
[
results.scheme,
re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"),
]
+ [re.escape(part).replace("\\*", ".+") for part in results[2:]]
)
_oembed_patterns[re.compile(pattern)] = endpoint


@attr.s
class OEmbedResult:
# Either HTML content or URL must be provided.
html = attr.ib(type=Optional[str])
url = attr.ib(type=Optional[str])
title = attr.ib(type=str)
clokep marked this conversation as resolved.
Show resolved Hide resolved
# Number of seconds to cache the content.
cache_age = attr.ib(type=int)


class OEmbedError(Exception):
"""An error occurred processing the oEmbed object."""


class PreviewUrlResource(DirectServeJsonResource):
isLeaf = True
Expand Down Expand Up @@ -99,7 +159,7 @@ def __init__(self, hs, media_repo, media_storage):
cache_name="url_previews",
clock=self.clock,
# don't spider URLs more often than once an hour
expiry_ms=60 * 60 * 1000,
expiry_ms=ONE_HOUR,
)

if self._worker_run_media_background_jobs:
Expand Down Expand Up @@ -310,6 +370,87 @@ async def _do_preview(self, url, user, ts):

return jsonog.encode("utf8")

def _get_oembed_url(self, url: str) -> Optional[str]:
"""
Check whether the URL should be downloaded as oEmbed content instead.

Params:
url: The URL to check.

Returns:
A URL to use instead or None if the original URL should be used.
"""
for url_pattern, endpoint in _oembed_patterns.items():
if url_pattern.fullmatch(url):
return endpoint

# No match.
return None

async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
"""
Request content from an oEmbed endpoint.

Params:
endpoint: The oEmbed API endpoint.
url: The URL to pass to the API.

Returns:
An object representing the metadata returned.

Raises:
OEmbedError if fetching or parsing of the oEmbed information fails.
"""
try:
logger.debug("Trying to get oEmbed content for url '%s'", url)
result = await self.client.get_json(
endpoint,
# TODO Specify max height / width.
# Note that only the JSON format is supported.
args={"url": url},
)

# Ensure there's a version of 1.0.
if result.get("version") != "1.0":
raise OEmbedError("Invalid version: %s" % (result.get("version"),))

oembed_type = result.get("type")

# Ensure the cache age is None or an int.
cache_age = result.get("cache_age")
if cache_age:
cache_age = int(cache_age)

oembed_result = OEmbedResult(None, None, result.get("title"), cache_age)

# HTML content.
if oembed_type == "rich":
oembed_result.html = result.get("html")
return oembed_result

if oembed_type == "photo":
oembed_result.url = result.get("url")
return oembed_result

# TODO Handle link and video types.

if "thumbnail_url" in result:
oembed_result.url = result.get("thumbnail_url")
return oembed_result

raise OEmbedError("Incompatible oEmbed information.")
clokep marked this conversation as resolved.
Show resolved Hide resolved

except OEmbedError as e:
# Trap OEmbedErrors first so we can directly re-raise them.
logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
raise

except Exception as e:
# Trap any exception and let the code follow as usual.
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading oEmbed metadata from %s: %r", url, e)
raise OEmbedError() from e

async def _download_url(self, url, user):
# TODO: we should probably honour robots.txt... except in practice
# we're most likely being explicitly triggered by a human rather than a
Expand All @@ -319,54 +460,90 @@ async def _download_url(self, url, user):

file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

with self.media_storage.store_into_file(file_info) as (f, fname, finish):
# If this URL can be accessed via oEmbed, use that instead.
url_to_download = url
oembed_url = self._get_oembed_url(url)
if oembed_url:
# The result might be a new URL to download, or it might be HTML content.
try:
logger.debug("Trying to get preview for url '%s'", url)
length, headers, uri, code = await self.client.get_file(
url,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e)
oembed_result = await self._get_oembed_content(oembed_url, url)
if oembed_result.url:
url_to_download = oembed_result.url
elif oembed_result.html:
url_to_download = None
except OEmbedError:
# If an error occurs, try doing a normal preview.
pass

raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()
if url_to_download:
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
try:
logger.debug("Trying to get preview for url '%s'", url_to_download)
length, headers, uri, code = await self.client.get_file(
url_to_download,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url_to_download, e)

raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()

if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"

download_name = get_filename_from_headers(headers)

# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers["ETag"][0] if "ETag" in headers else None
else:
html_bytes = oembed_result.html.encode("utf-8") # type: ignore
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
f.write(html_bytes)
anoadragon453 marked this conversation as resolved.
Show resolved Hide resolved
await finish()

media_type = "text/html"
download_name = oembed_result.title
length = len(html_bytes)
# If a specific cache age was not given, assume 1 hour.
expires = oembed_result.cache_age or ONE_HOUR
uri = oembed_url
code = 200
etag = None

try:
if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"
time_now_ms = self.clock.time_msec()

download_name = get_filename_from_headers(headers)

await self.store.store_local_media(
media_id=file_id,
media_type=media_type,
time_now_ms=self.clock.time_msec(),
time_now_ms=time_now_ms,
upload_name=download_name,
media_length=length,
user_id=user,
Expand All @@ -389,10 +566,8 @@ async def _download_url(self, url, user):
"filename": fname,
"uri": uri,
"response_code": code,
# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
"expires": 60 * 60 * 1000,
"etag": headers["ETag"][0] if "ETag" in headers else None,
"expires": expires,
"etag": etag,
}

def _start_expire_url_cache_data(self):
Expand Down Expand Up @@ -449,7 +624,7 @@ async def _expire_url_cache_data(self):
# These may be cached for a bit on the client (i.e., they
# may have a room open with a preview url thing open).
# So we wait a couple of days before deleting, just in case.
expire_before = now - 2 * 24 * 60 * 60 * 1000
expire_before = now - 2 * 24 * ONE_HOUR
media_ids = await self.store.get_url_cache_media_before(expire_before)

removed_media = []
Expand Down
Loading