From fe4d086cddb75cefff3667a718dcfd7877ed67a2 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 21 Jul 2020 11:07:17 -0400 Subject: [PATCH 01/18] Support oEmbed for media previews. --- synapse/rest/media/v1/preview_url_resource.py | 227 ++++++++++++++---- tests/rest/media/v1/test_url_preview.py | 111 ++++++++- 2 files changed, 293 insertions(+), 45 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index e52c86c798f1..f28d64d9bc5c 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -26,6 +26,7 @@ from typing import Dict, Optional from urllib import parse as urlparse +import attr from canonicaljson import json from twisted.internet import defer @@ -56,6 +57,38 @@ OG_TAG_NAME_MAXLEN = 50 OG_TAG_VALUE_MAXLEN = 1000 +# A map of globs to API endpoints. +_oembed_globs = { + # Twitter. + "https://publish.twitter.com/oembed": [ + "https://twitter.com/*/status/*", + "https://*.twitter.com/*/status/*", + "https://twitter.com/*/moments/*", + "https://*.twitter.com/*/moments/*", + # Include the HTTP versions too. + "http://twitter.com/*/status/*", + "http://*.twitter.com/*/status/*", + "http://twitter.com/*/moments/*", + "http://*.twitter.com/*/moments/*", + ], +} +# Convert the globs to regular expressions. +_oembed_patterns = {} +for endpoint, globs in _oembed_globs.items(): + for glob in globs: + pattern = re.compile(re.escape(glob).replace("\\*", ".+")) + _oembed_patterns[pattern] = endpoint + + +@attr.s +class OEmbedResult: + # Content or URL must be provided. + html = attr.attrib(type=str) + url = attr.attrib(type=str) + title = attr.attrib(type=str) + # Number of seconds to cache the content. + cache_age = attr.attrib(type=int) + class PreviewUrlResource(DirectServeJsonResource): isLeaf = True @@ -310,6 +343,81 @@ async def _do_preview(self, url, user, ts): return jsonog.encode("utf8") + def _get_oembed_url(self, url: str) -> Optional[str]: + """ + Check whether the URL should be downloaded as oEmbed content instead. + + Params: + url: The URL to check. + + Returns: + A URL to use instead or None if the original URL should be used. + """ + for url_pattern, endpoint in _oembed_patterns.items(): + if url_pattern.fullmatch(url): + return endpoint + + async def _get_oembed_content( + self, endpoint: str, url: str + ) -> Optional[OEmbedResult]: + """ + Request content from an oEmbed endpoint. + + Params: + endpoint: The oEmbed API endpoint. + url: The URL to pass to the API. + + Returns: + A tuple of a string and boolean. If the boolean is true the str is a + URL and should be downloaded. If it is false, the string is HTML + content to use. + """ + try: + logger.debug("Trying to get oEmbed content for url '%s'", url) + result = await self.client.get_json( + endpoint, + # TODO Specify max height / width. + # Note that only the JSON format is supported. + args={"url": url}, + ) + + # Ensure there's a version of 1.0. + if result.get("version") != "1.0": + return None + + oembed_type = result.get("type") + + # Ensure the cache age is None or an int. + cache_age = result.get("cache_age") + if cache_age: + cache_age = int(cache_age) + + oembed_result = OEmbedResult(None, None, result.get("title"), cache_age) + + # HTML content. + if oembed_type == "rich": + oembed_result.html = result.get("html") + return oembed_result + + if oembed_type == "photo": + oembed_result.url = result.get("url") + return oembed_result + + # TODO can we do anything better for the video type besides using + # the thumbnail? + + if "thumbnail_url" in result: + oembed_result.url = result.get("thumbnail_url") + return oembed_result + + return None + + except Exception as e: + # Trap any exception and let the code follow as usual. + # FIXME: pass through 404s and other error messages nicely + logger.warning("Error downloading %s: %r", url, e) + return None + async def _download_url(self, url, user): # TODO: we should probably honour robots.txt... except in practice # we're most likely being explicitly triggered by a human rather than a @@ -319,54 +427,87 @@ async def _download_url(self, url, user): file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True) - with self.media_storage.store_into_file(file_info) as (f, fname, finish): - try: - logger.debug("Trying to get preview for url '%s'", url) - length, headers, uri, code = await self.client.get_file( - url, - output_stream=f, - max_size=self.max_spider_size, - headers={"Accept-Language": self.url_preview_accept_language}, - ) - except SynapseError: - # Pass SynapseErrors through directly, so that the servlet - # handler will return a SynapseError to the client instead of - # blank data or a 500. - raise - except DNSLookupError: - # DNS lookup returned no results - # Note: This will also be the case if one of the resolved IP - # addresses is blacklisted - raise SynapseError( - 502, - "DNS resolution failure during URL preview generation", - Codes.UNKNOWN, - ) - except Exception as e: - # FIXME: pass through 404s and other error messages nicely - logger.warning("Error downloading %s: %r", url, e) + # If this URL can be accessed via oEmbed, use that instead. + url_to_download = url + oembed_url = self._get_oembed_url(url) + if oembed_url: + # The result might be a new URL to download, or it might be HTML content. + oembed_result = await self._get_oembed_content(oembed_url, url) + if oembed_result: + if oembed_result.url: + url_to_download = oembed_result.url + elif oembed_result.html: + url_to_download = None + + if url_to_download: + with self.media_storage.store_into_file(file_info) as (f, fname, finish): + try: + logger.debug("Trying to get preview for url '%s'", url_to_download) + length, headers, uri, code = await self.client.get_file( + url_to_download, + output_stream=f, + max_size=self.max_spider_size, + headers={"Accept-Language": self.url_preview_accept_language}, + ) + except SynapseError: + # Pass SynapseErrors through directly, so that the servlet + # handler will return a SynapseError to the client instead of + # blank data or a 500. + raise + except DNSLookupError: + # DNS lookup returned no results + # Note: This will also be the case if one of the resolved IP + # addresses is blacklisted + raise SynapseError( + 502, + "DNS resolution failure during URL preview generation", + Codes.UNKNOWN, + ) + except Exception as e: + # FIXME: pass through 404s and other error messages nicely + logger.warning("Error downloading %s: %r", url_to_download, e) + + raise SynapseError( + 500, + "Failed to download content: %s" + % (traceback.format_exception_only(sys.exc_info()[0], e),), + Codes.UNKNOWN, + ) + await finish() - raise SynapseError( - 500, - "Failed to download content: %s" - % (traceback.format_exception_only(sys.exc_info()[0], e),), - Codes.UNKNOWN, - ) - await finish() + if b"Content-Type" in headers: + media_type = headers[b"Content-Type"][0].decode("ascii") + else: + media_type = "application/octet-stream" + + download_name = get_filename_from_headers(headers) + + # FIXME: we should calculate a proper expiration based on the + # Cache-Control and Expire headers. But for now, assume 1 hour. + expires = 60 * 60 * 1000 + etag = headers["ETag"][0] if "ETag" in headers else None + else: + html_bytes = oembed_result.html.encode("utf-8") + with self.media_storage.store_into_file(file_info) as (f, fname, finish): + f.write(html_bytes) + await finish() + + media_type = "text/html" + download_name = oembed_result.title + length = len(html_bytes) + # If a specific cache age was not given, assume 1 hour. + expires = oembed_result.cache_age or (60 * 60 * 1000) + uri = oembed_url + code = 200 + etag = None try: - if b"Content-Type" in headers: - media_type = headers[b"Content-Type"][0].decode("ascii") - else: - media_type = "application/octet-stream" time_now_ms = self.clock.time_msec() - download_name = get_filename_from_headers(headers) - await self.store.store_local_media( media_id=file_id, media_type=media_type, - time_now_ms=self.clock.time_msec(), + time_now_ms=time_now_ms, upload_name=download_name, media_length=length, user_id=user, @@ -389,10 +530,8 @@ async def _download_url(self, url, user): "filename": fname, "uri": uri, "response_code": code, - # FIXME: we should calculate a proper expiration based on the - # Cache-Control and Expire headers. But for now, assume 1 hour. - "expires": 60 * 60 * 1000, - "etag": headers["ETag"][0] if "ETag" in headers else None, + "expires": expires, + "etag": etag, } def _start_expire_url_cache_data(self): diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 2826211f3213..578538902b94 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -12,11 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import json import os +import re import attr +from mock import patch + from twisted.internet._resolver import HostResolution from twisted.internet.address import IPv4Address, IPv6Address from twisted.internet.error import DNSLookupError @@ -24,6 +27,8 @@ from twisted.test.proto_helpers import AccumulatingProtocol from twisted.web._newclient import ResponseDone +from synapse.rest.media.v1.preview_url_resource import _oembed_patterns + from tests import unittest from tests.server import FakeTransport @@ -562,3 +567,107 @@ def test_accept_language_config_option(self): ), server.data, ) + + def test_oembed_url(self): + """Test changing a URL via OEmbed.""" + # Route the HTTP version to an HTTP endpoint so that the tests work. + with patch.dict(_oembed_patterns, { + re.compile("http://twitter\\.com/.+/status/.+"): "http://publish.twitter.com/oembed", + }): + + self.lookups["publish.twitter.com"] = [(IPv4Address, "8.8.8.8")] + self.lookups["cdn.twitter.com"] = [(IPv4Address, "8.8.8.8")] + + result = { + "version": "1.0", + "type": "photo", + "url": "http://cdn.twitter.com/matrixdotorg", + } + oembed_content = json.dumps(result).encode("utf-8") + + end_content = ( + b"" + b"Some Title" + b'' + b"" + ) + + request, channel = self.make_request( + "GET", "url_preview?url=http://twitter.com/matrixdotorg/status/12345", shorthand=False + ) + request.render(self.preview_url) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: application/json; charset="utf8"\r\n\r\n' + ) + % (len(oembed_content),) + + oembed_content + ) + + client = self.reactor.tcpClients[1][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: text/html; charset="utf8"\r\n\r\n' + ) + % (len(end_content),) + + end_content + ) + + self.pump() + + self.assertEqual(channel.code, 200) + self.assertEqual( + channel.json_body, {"og:title": "Some Title", "og:description": "hi"} + ) + + def test_oembed_html(self): + """Test an OEmbed endpoint which returns HTML.""" + # Route the HTTP version to an HTTP endpoint so that the tests work. + with patch.dict(_oembed_patterns, { + re.compile("http://twitter\\.com/.+/status/.+"): "http://publish.twitter.com/oembed", + }): + + self.lookups["publish.twitter.com"] = [(IPv4Address, "8.8.8.8")] + + result = { + "version": "1.0", + "type": "rich", + "html": "
Content Preview
", + } + end_content = json.dumps(result).encode("utf-8") + + request, channel = self.make_request( + "GET", "url_preview?url=http://twitter.com/matrixdotorg/status/12345", shorthand=False + ) + request.render(self.preview_url) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: application/json; charset="utf8"\r\n\r\n' + ) + % (len(end_content),) + + end_content + ) + + self.pump() + self.assertEqual(channel.code, 200) + self.assertEqual( + channel.json_body, {"og:title": None, "og:description": "Content Preview"} + ) From 21e1aaa929e72d499324aa73da1e363bd95837aa Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 21 Jul 2020 13:07:22 -0400 Subject: [PATCH 02/18] Add newsfragment. --- changelog.d/7920.feature | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/7920.feature diff --git a/changelog.d/7920.feature b/changelog.d/7920.feature new file mode 100644 index 000000000000..4093f5d329bc --- /dev/null +++ b/changelog.d/7920.feature @@ -0,0 +1 @@ +Support oEmbed for media previews. From 24b8398d5a88cf1aa99e6d310dbf25c287a5cac4 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 21 Jul 2020 13:26:05 -0400 Subject: [PATCH 03/18] Lint --- tests/rest/media/v1/test_url_preview.py | 37 +++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 578538902b94..f8b43da7a864 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -16,10 +16,10 @@ import os import re -import attr - from mock import patch +import attr + from twisted.internet._resolver import HostResolution from twisted.internet.address import IPv4Address, IPv6Address from twisted.internet.error import DNSLookupError @@ -571,9 +571,14 @@ def test_accept_language_config_option(self): def test_oembed_url(self): """Test changing a URL via OEmbed.""" # Route the HTTP version to an HTTP endpoint so that the tests work. - with patch.dict(_oembed_patterns, { - re.compile("http://twitter\\.com/.+/status/.+"): "http://publish.twitter.com/oembed", - }): + with patch.dict( + _oembed_patterns, + { + re.compile( + "http://twitter\\.com/.+/status/.+" + ): "http://publish.twitter.com/oembed", + }, + ): self.lookups["publish.twitter.com"] = [(IPv4Address, "8.8.8.8")] self.lookups["cdn.twitter.com"] = [(IPv4Address, "8.8.8.8")] @@ -593,7 +598,9 @@ def test_oembed_url(self): ) request, channel = self.make_request( - "GET", "url_preview?url=http://twitter.com/matrixdotorg/status/12345", shorthand=False + "GET", + "url_preview?url=http://twitter.com/matrixdotorg/status/12345", + shorthand=False, ) request.render(self.preview_url) self.pump() @@ -634,9 +641,14 @@ def test_oembed_url(self): def test_oembed_html(self): """Test an OEmbed endpoint which returns HTML.""" # Route the HTTP version to an HTTP endpoint so that the tests work. - with patch.dict(_oembed_patterns, { - re.compile("http://twitter\\.com/.+/status/.+"): "http://publish.twitter.com/oembed", - }): + with patch.dict( + _oembed_patterns, + { + re.compile( + "http://twitter\\.com/.+/status/.+" + ): "http://publish.twitter.com/oembed", + }, + ): self.lookups["publish.twitter.com"] = [(IPv4Address, "8.8.8.8")] @@ -648,7 +660,9 @@ def test_oembed_html(self): end_content = json.dumps(result).encode("utf-8") request, channel = self.make_request( - "GET", "url_preview?url=http://twitter.com/matrixdotorg/status/12345", shorthand=False + "GET", + "url_preview?url=http://twitter.com/matrixdotorg/status/12345", + shorthand=False, ) request.render(self.preview_url) self.pump() @@ -669,5 +683,6 @@ def test_oembed_html(self): self.pump() self.assertEqual(channel.code, 200) self.assertEqual( - channel.json_body, {"og:title": None, "og:description": "Content Preview"} + channel.json_body, + {"og:title": None, "og:description": "Content Preview"}, ) From b7e6c59007f89a80fc0dc85d123c2786e4d175c4 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 21 Jul 2020 13:36:47 -0400 Subject: [PATCH 04/18] Rework the flow a bit to make mypy happy. --- synapse/rest/media/v1/preview_url_resource.py | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index f28d64d9bc5c..251a8bc1ae3a 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -83,13 +83,17 @@ @attr.s class OEmbedResult: # Content or URL must be provided. - html = attr.attrib(type=str) - url = attr.attrib(type=str) + html = attr.attrib(type=Optional[str]) + url = attr.attrib(type=Optional[str]) title = attr.attrib(type=str) # Number of seconds to cache the content. cache_age = attr.attrib(type=int) +class OEmbedError(Exception): + """An error occurred processing the oEmbed object.""" + + class PreviewUrlResource(DirectServeJsonResource): isLeaf = True @@ -357,9 +361,10 @@ def _get_oembed_url(self, url: str) -> Optional[str]: if url_pattern.fullmatch(url): return endpoint - async def _get_oembed_content( - self, endpoint: str, url: str - ) -> Optional[OEmbedResult]: + # No match. + return None + + async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult: """ Request content from an oEmbed endpoint. @@ -368,9 +373,10 @@ async def _get_oembed_content( url: The URL to pass to the API. Returns: - A tuple of a string and boolean. If the boolean is true the str is a - URL and should be downloaded. If it is false, the string is HTML - content to use. + An object representing the metadata returned. + + Raises: + OEmbedError if fetching or parsing of the oEmbed information fails. """ try: logger.debug("Trying to get oEmbed content for url '%s'", url) @@ -383,7 +389,7 @@ async def _get_oembed_content( # Ensure there's a version of 1.0. if result.get("version") != "1.0": - return None + raise OEmbedError("Invalid version: %s" % (result.get("version"),)) oembed_type = result.get("type") @@ -410,13 +416,13 @@ async def _get_oembed_content( oembed_result.url = result.get("thumbnail_url") return oembed_result - return None + raise OEmbedError("Incompatible oEmbed information.") except Exception as e: # Trap any exception and let the code follow as usual. # FIXME: pass through 404s and other error messages nicely logger.warning("Error downloading %s: %r", url, e) - return None + raise OEmbedError() from e async def _download_url(self, url, user): # TODO: we should probably honour robots.txt... except in practice @@ -432,12 +438,15 @@ async def _download_url(self, url, user): oembed_url = self._get_oembed_url(url) if oembed_url: # The result might be a new URL to download, or it might be HTML content. - oembed_result = await self._get_oembed_content(oembed_url, url) - if oembed_result: + try: + oembed_result = await self._get_oembed_content(oembed_url, url) if oembed_result.url: url_to_download = oembed_result.url elif oembed_result.html: url_to_download = None + except OEmbedError: + # If an error occurs, try doing a normal preview. + pass if url_to_download: with self.media_storage.store_into_file(file_info) as (f, fname, finish): @@ -487,7 +496,7 @@ async def _download_url(self, url, user): expires = 60 * 60 * 1000 etag = headers["ETag"][0] if "ETag" in headers else None else: - html_bytes = oembed_result.html.encode("utf-8") + html_bytes = oembed_result.html.encode("utf-8") # type: ignore with self.media_storage.store_into_file(file_info) as (f, fname, finish): f.write(html_bytes) await finish() From 2c4874410612171b5bb50d87477e8cbbfcb8d54e Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 21 Jul 2020 13:53:14 -0400 Subject: [PATCH 05/18] Fix race condition (?) in tests. --- tests/rest/media/v1/test_url_preview.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index f8b43da7a864..ec536cfd34a0 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -618,6 +618,8 @@ def test_oembed_url(self): + oembed_content ) + self.pump() + client = self.reactor.tcpClients[1][2].buildProtocol(None) server = AccumulatingProtocol() server.makeConnection(FakeTransport(client, self.reactor)) From f051f55d92c055ea0f23bf102670f298686baff2 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 22 Jul 2020 08:15:08 -0400 Subject: [PATCH 06/18] Fix comments. --- tests/rest/media/v1/test_url_preview.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index ec536cfd34a0..f16ae853c1a7 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -569,7 +569,7 @@ def test_accept_language_config_option(self): ) def test_oembed_url(self): - """Test changing a URL via OEmbed.""" + """Test changing a URL via oEmbed.""" # Route the HTTP version to an HTTP endpoint so that the tests work. with patch.dict( _oembed_patterns, @@ -641,7 +641,7 @@ def test_oembed_url(self): ) def test_oembed_html(self): - """Test an OEmbed endpoint which returns HTML.""" + """Test an oEmbed endpoint which returns HTML.""" # Route the HTTP version to an HTTP endpoint so that the tests work. with patch.dict( _oembed_patterns, From 12bca9aed53d4c1213893e27d12bcf04354abec4 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 22 Jul 2020 08:39:37 -0400 Subject: [PATCH 07/18] Fix failing tests. --- tests/rest/media/v1/test_url_preview.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index f16ae853c1a7..16523ac8f2ad 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -575,9 +575,10 @@ def test_oembed_url(self): _oembed_patterns, { re.compile( - "http://twitter\\.com/.+/status/.+" + "http\\:\\/\\/twitter\\.com\\/.+\\/status\\/.+" ): "http://publish.twitter.com/oembed", }, + clear=True, ): self.lookups["publish.twitter.com"] = [(IPv4Address, "8.8.8.8")] @@ -647,9 +648,10 @@ def test_oembed_html(self): _oembed_patterns, { re.compile( - "http://twitter\\.com/.+/status/.+" + "http\\:\\/\\/twitter\\.com\\/.+\\/status\\/.+" ): "http://publish.twitter.com/oembed", }, + clear=True, ): self.lookups["publish.twitter.com"] = [(IPv4Address, "8.8.8.8")] From 46a5414772e515142dfea27f49fa02a51a4f5530 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 22 Jul 2020 10:49:05 -0400 Subject: [PATCH 08/18] Use Python 3.7 escaping rules. --- tests/rest/media/v1/test_url_preview.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 16523ac8f2ad..f523fdb99b1a 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -575,7 +575,7 @@ def test_oembed_url(self): _oembed_patterns, { re.compile( - "http\\:\\/\\/twitter\\.com\\/.+\\/status\\/.+" + "http://twitter\\.com/.+/status/.+" ): "http://publish.twitter.com/oembed", }, clear=True, @@ -648,7 +648,7 @@ def test_oembed_html(self): _oembed_patterns, { re.compile( - "http\\:\\/\\/twitter\\.com\\/.+\\/status\\/.+" + "http://twitter\\.com/.+/status/.+" ): "http://publish.twitter.com/oembed", }, clear=True, From 814842bc286513b27e3c55effaeb2a2930638eed Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 22 Jul 2020 11:06:15 -0400 Subject: [PATCH 09/18] More safely escape domain names. --- synapse/rest/media/v1/preview_url_resource.py | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 251a8bc1ae3a..a1769b5fcd4a 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -76,8 +76,29 @@ _oembed_patterns = {} for endpoint, globs in _oembed_globs.items(): for glob in globs: - pattern = re.compile(re.escape(glob).replace("\\*", ".+")) - _oembed_patterns[pattern] = endpoint + # Convert the glob into a sane regular expression to match against. The + # rules followed will be slightly different for the domain portion vs. + # the rest. + # + # 1. The scheme must be one of HTTP / HTTPS (and have no globs). + # 2. The domain can have globs, but we limit it to characters that can + # reasonably be a domain part. + # TODO: This does not attempt to handle Unicode domain names. + # 3. Other parts allow a glob to be any one, or more, characters. + results = urlparse.urlparse(glob) + + # Ensure the scheme does not have wildcards (and is a sane scheme). + if results.scheme not in {"http", "https"}: + raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,)) + + pattern = urlparse.urlunparse( + [ + results.scheme, + re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"), + ] + + list(map(lambda part: re.escape(part).replace("\\*", ".+"), results[2:])) + ) + _oembed_patterns[re.compile(pattern)] = endpoint @attr.s From 2cbe7d84220d948707d8c0ee3672365bc0ad324c Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 22 Jul 2020 11:24:13 -0400 Subject: [PATCH 10/18] Different patching strategy in tests. --- tests/rest/media/v1/test_url_preview.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index f523fdb99b1a..7e34671191d9 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -27,8 +27,6 @@ from twisted.test.proto_helpers import AccumulatingProtocol from twisted.web._newclient import ResponseDone -from synapse.rest.media.v1.preview_url_resource import _oembed_patterns - from tests import unittest from tests.server import FakeTransport @@ -572,7 +570,7 @@ def test_oembed_url(self): """Test changing a URL via oEmbed.""" # Route the HTTP version to an HTTP endpoint so that the tests work. with patch.dict( - _oembed_patterns, + "synapse.rest.media.v1.preview_url_resource._oembed_patterns", { re.compile( "http://twitter\\.com/.+/status/.+" @@ -645,7 +643,7 @@ def test_oembed_html(self): """Test an oEmbed endpoint which returns HTML.""" # Route the HTTP version to an HTTP endpoint so that the tests work. with patch.dict( - _oembed_patterns, + "synapse.rest.media.v1.preview_url_resource._oembed_patterns", { re.compile( "http://twitter\\.com/.+/status/.+" From a8e20bc5ff9004d05313d5d6b6cb3566da0047bc Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 22 Jul 2020 11:46:52 -0400 Subject: [PATCH 11/18] Formatting changes. --- synapse/rest/media/v1/preview_url_resource.py | 10 +++++----- tests/rest/media/v1/test_url_preview.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index a1769b5fcd4a..f49321ca1ed0 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -96,7 +96,7 @@ results.scheme, re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"), ] - + list(map(lambda part: re.escape(part).replace("\\*", ".+"), results[2:])) + + [re.escape(part).replace("\\*", ".+") for part in results[2:]] ) _oembed_patterns[re.compile(pattern)] = endpoint @@ -104,11 +104,11 @@ @attr.s class OEmbedResult: # Content or URL must be provided. - html = attr.attrib(type=Optional[str]) - url = attr.attrib(type=Optional[str]) - title = attr.attrib(type=str) + html = attr.ib(type=Optional[str]) + url = attr.ib(type=Optional[str]) + title = attr.ib(type=str) # Number of seconds to cache the content. - cache_age = attr.attrib(type=int) + cache_age = attr.ib(type=int) class OEmbedError(Exception): diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 7e34671191d9..800cb0325cdb 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -573,7 +573,7 @@ def test_oembed_url(self): "synapse.rest.media.v1.preview_url_resource._oembed_patterns", { re.compile( - "http://twitter\\.com/.+/status/.+" + r"http://twitter\.com/.+/status/.+" ): "http://publish.twitter.com/oembed", }, clear=True, @@ -646,7 +646,7 @@ def test_oembed_html(self): "synapse.rest.media.v1.preview_url_resource._oembed_patterns", { re.compile( - "http://twitter\\.com/.+/status/.+" + r"http://twitter\.com/.+/status/.+" ): "http://publish.twitter.com/oembed", }, clear=True, From 79eabaa647a1d2a6d2a5914b791bf521110bd185 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 24 Jul 2020 08:03:57 -0400 Subject: [PATCH 12/18] Update wording from review. Co-authored-by: Andrew Morgan <1342360+anoadragon453@users.noreply.github.com> --- synapse/rest/media/v1/preview_url_resource.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index f49321ca1ed0..37dc8b85a5a9 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -82,7 +82,7 @@ # # 1. The scheme must be one of HTTP / HTTPS (and have no globs). # 2. The domain can have globs, but we limit it to characters that can - # reasonably be a domain part. + # reasonably be a domain part. # TODO: This does not attempt to handle Unicode domain names. # 3. Other parts allow a glob to be any one, or more, characters. results = urlparse.urlparse(glob) @@ -103,7 +103,7 @@ @attr.s class OEmbedResult: - # Content or URL must be provided. + # Either HTML content or URL must be provided. html = attr.ib(type=Optional[str]) url = attr.ib(type=Optional[str]) title = attr.ib(type=str) @@ -442,7 +442,7 @@ async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult: except Exception as e: # Trap any exception and let the code follow as usual. # FIXME: pass through 404s and other error messages nicely - logger.warning("Error downloading %s: %r", url, e) + logger.warning("Error downloading oEmbed metadata: %s: %r", url, e) raise OEmbedError() from e async def _download_url(self, url, user): From 37454a877176c68c0063ae4ee7c9947467f59e6e Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 24 Jul 2020 08:13:57 -0400 Subject: [PATCH 13/18] Use a constant for 1 hour. --- synapse/rest/media/v1/preview_url_resource.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 37dc8b85a5a9..a2712e4ea670 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -57,6 +57,8 @@ OG_TAG_NAME_MAXLEN = 50 OG_TAG_VALUE_MAXLEN = 1000 +ONE_HOUR = 60 * 60 * 1000 + # A map of globs to API endpoints. _oembed_globs = { # Twitter. @@ -157,7 +159,7 @@ def __init__(self, hs, media_repo, media_storage): cache_name="url_previews", clock=self.clock, # don't spider URLs more often than once an hour - expiry_ms=60 * 60 * 1000, + expiry_ms=ONE_HOUR, ) if self._worker_run_media_background_jobs: @@ -514,7 +516,7 @@ async def _download_url(self, url, user): # FIXME: we should calculate a proper expiration based on the # Cache-Control and Expire headers. But for now, assume 1 hour. - expires = 60 * 60 * 1000 + expires = ONE_HOUR etag = headers["ETag"][0] if "ETag" in headers else None else: html_bytes = oembed_result.html.encode("utf-8") # type: ignore @@ -526,7 +528,7 @@ async def _download_url(self, url, user): download_name = oembed_result.title length = len(html_bytes) # If a specific cache age was not given, assume 1 hour. - expires = oembed_result.cache_age or (60 * 60 * 1000) + expires = oembed_result.cache_age or ONE_HOUR uri = oembed_url code = 200 etag = None @@ -618,7 +620,7 @@ async def _expire_url_cache_data(self): # These may be cached for a bit on the client (i.e., they # may have a room open with a preview url thing open). # So we wait a couple of days before deleting, just in case. - expire_before = now - 2 * 24 * 60 * 60 * 1000 + expire_before = now - 2 * 24 * ONE_HOUR media_ids = await self.store.get_url_cache_media_before(expire_before) removed_media = [] From c5c10c92c0fbed7f95da3813c752a9e523e57dfe Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 24 Jul 2020 08:18:00 -0400 Subject: [PATCH 14/18] Use private IPs in tests. --- tests/rest/media/v1/test_url_preview.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 800cb0325cdb..49c052a19e02 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -134,7 +134,7 @@ def resolveHostName( self.reactor.nameResolver = Resolver() def test_cache_returns_correct_type(self): - self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")] + self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] request, channel = self.make_request( "GET", "url_preview?url=http://matrix.org", shorthand=False @@ -190,7 +190,7 @@ def test_cache_returns_correct_type(self): ) def test_non_ascii_preview_httpequiv(self): - self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")] + self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] end_content = ( b"" @@ -224,7 +224,7 @@ def test_non_ascii_preview_httpequiv(self): self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") def test_non_ascii_preview_content_type(self): - self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")] + self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] end_content = ( b"" @@ -257,7 +257,7 @@ def test_non_ascii_preview_content_type(self): self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") def test_overlong_title(self): - self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")] + self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] end_content = ( b"" @@ -295,7 +295,7 @@ def test_ipaddr(self): """ IP addresses can be previewed directly. """ - self.lookups["example.com"] = [(IPv4Address, "8.8.8.8")] + self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")] request, channel = self.make_request( "GET", "url_preview?url=http://example.com", shorthand=False @@ -442,7 +442,7 @@ def test_blacklisted_ip_with_external_ip(self): # Hardcode the URL resolving to the IP we want. self.lookups["example.com"] = [ (IPv4Address, "1.1.1.2"), - (IPv4Address, "8.8.8.8"), + (IPv4Address, "10.1.2.3"), ] request, channel = self.make_request( @@ -521,7 +521,7 @@ def test_accept_language_config_option(self): """ Accept-Language header is sent to the remote server """ - self.lookups["example.com"] = [(IPv4Address, "8.8.8.8")] + self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")] # Build and make a request to the server request, channel = self.make_request( @@ -579,8 +579,8 @@ def test_oembed_url(self): clear=True, ): - self.lookups["publish.twitter.com"] = [(IPv4Address, "8.8.8.8")] - self.lookups["cdn.twitter.com"] = [(IPv4Address, "8.8.8.8")] + self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] + self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")] result = { "version": "1.0", @@ -652,7 +652,7 @@ def test_oembed_html(self): clear=True, ): - self.lookups["publish.twitter.com"] = [(IPv4Address, "8.8.8.8")] + self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] result = { "version": "1.0", From bb53316c7067d29fa926bc03d22512d2f2cbfcfd Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 24 Jul 2020 08:19:25 -0400 Subject: [PATCH 15/18] Update comment. --- synapse/rest/media/v1/preview_url_resource.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index a2712e4ea670..30123ecee26a 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -432,8 +432,7 @@ async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult: oembed_result.url = result.get("url") return oembed_result - # TODO can we do anything better for the video type besides using - # the thumbnail? + # TODO Handle link and video types. if "thumbnail_url" in result: oembed_result.url = result.get("thumbnail_url") From 76328fbc0034bb1e7089b977d71cda944ade9aae Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 24 Jul 2020 08:25:12 -0400 Subject: [PATCH 16/18] Update test names / comments. --- tests/rest/media/v1/test_url_preview.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 49c052a19e02..74765a582bfc 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -566,8 +566,8 @@ def test_accept_language_config_option(self): server.data, ) - def test_oembed_url(self): - """Test changing a URL via oEmbed.""" + def test_oembed_photo(self): + """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL.""" # Route the HTTP version to an HTTP endpoint so that the tests work. with patch.dict( "synapse.rest.media.v1.preview_url_resource._oembed_patterns", @@ -639,8 +639,8 @@ def test_oembed_url(self): channel.json_body, {"og:title": "Some Title", "og:description": "hi"} ) - def test_oembed_html(self): - """Test an oEmbed endpoint which returns HTML.""" + def test_oembed_rich(self): + """Test an oEmbed endpoint which returns HTML content via the 'rich' type.""" # Route the HTTP version to an HTTP endpoint so that the tests work. with patch.dict( "synapse.rest.media.v1.preview_url_resource._oembed_patterns", From 9b4d9036b263859388cd52ec3995345a36b4253e Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 24 Jul 2020 08:39:33 -0400 Subject: [PATCH 17/18] Add a bit more logging in the error handling. --- synapse/rest/media/v1/preview_url_resource.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 30123ecee26a..1d669a18623f 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -440,10 +440,15 @@ async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult: raise OEmbedError("Incompatible oEmbed information.") + except OEmbedError as e: + # Trap OEmbedErrors first so we can directly re-raise them. + logger.warning("Error parsing oEmbed metadata from %s: %r", url, e) + raise + except Exception as e: # Trap any exception and let the code follow as usual. # FIXME: pass through 404s and other error messages nicely - logger.warning("Error downloading oEmbed metadata: %s: %r", url, e) + logger.warning("Error downloading oEmbed metadata from %s: %r", url, e) raise OEmbedError() from e async def _download_url(self, url, user): From 5ee2b968bbd3395230630fe55327c7d836d0aed1 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 24 Jul 2020 13:02:33 -0400 Subject: [PATCH 18/18] Update type hint for reality. --- synapse/rest/media/v1/preview_url_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 1d669a18623f..13d1a6d2ed1e 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -108,7 +108,7 @@ class OEmbedResult: # Either HTML content or URL must be provided. html = attr.ib(type=Optional[str]) url = attr.ib(type=Optional[str]) - title = attr.ib(type=str) + title = attr.ib(type=Optional[str]) # Number of seconds to cache the content. cache_age = attr.ib(type=int)