Skip to content
Snippets Groups Projects
Unverified Commit 3fc8fdd1 authored by Patrick Cloke's avatar Patrick Cloke Committed by GitHub
Browse files

Support oEmbed for media previews. (#7920)

Fixes previews of Twitter URLs by using their oEmbed endpoint to grab content.
parent b975fa2e
No related branches found
No related tags found
No related merge requests found
Support oEmbed for media previews.
......@@ -26,6 +26,7 @@ import traceback
from typing import Dict, Optional
from urllib import parse as urlparse
import attr
from canonicaljson import json
from twisted.internet import defer
......@@ -56,6 +57,65 @@ _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000
ONE_HOUR = 60 * 60 * 1000
# A map of globs to API endpoints.
_oembed_globs = {
# Twitter.
"https://publish.twitter.com/oembed": [
"https://twitter.com/*/status/*",
"https://*.twitter.com/*/status/*",
"https://twitter.com/*/moments/*",
"https://*.twitter.com/*/moments/*",
# Include the HTTP versions too.
"http://twitter.com/*/status/*",
"http://*.twitter.com/*/status/*",
"http://twitter.com/*/moments/*",
"http://*.twitter.com/*/moments/*",
],
}
# Convert the globs to regular expressions.
_oembed_patterns = {}
for endpoint, globs in _oembed_globs.items():
for glob in globs:
# Convert the glob into a sane regular expression to match against. The
# rules followed will be slightly different for the domain portion vs.
# the rest.
#
# 1. The scheme must be one of HTTP / HTTPS (and have no globs).
# 2. The domain can have globs, but we limit it to characters that can
# reasonably be a domain part.
# TODO: This does not attempt to handle Unicode domain names.
# 3. Other parts allow a glob to be any one, or more, characters.
results = urlparse.urlparse(glob)
# Ensure the scheme does not have wildcards (and is a sane scheme).
if results.scheme not in {"http", "https"}:
raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,))
pattern = urlparse.urlunparse(
[
results.scheme,
re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"),
]
+ [re.escape(part).replace("\\*", ".+") for part in results[2:]]
)
_oembed_patterns[re.compile(pattern)] = endpoint
@attr.s
class OEmbedResult:
# Either HTML content or URL must be provided.
html = attr.ib(type=Optional[str])
url = attr.ib(type=Optional[str])
title = attr.ib(type=Optional[str])
# Number of seconds to cache the content.
cache_age = attr.ib(type=int)
class OEmbedError(Exception):
"""An error occurred processing the oEmbed object."""
class PreviewUrlResource(DirectServeJsonResource):
isLeaf = True
......@@ -99,7 +159,7 @@ class PreviewUrlResource(DirectServeJsonResource):
cache_name="url_previews",
clock=self.clock,
# don't spider URLs more often than once an hour
expiry_ms=60 * 60 * 1000,
expiry_ms=ONE_HOUR,
)
if self._worker_run_media_background_jobs:
......@@ -310,6 +370,87 @@ class PreviewUrlResource(DirectServeJsonResource):
return jsonog.encode("utf8")
def _get_oembed_url(self, url: str) -> Optional[str]:
"""
Check whether the URL should be downloaded as oEmbed content instead.
Params:
url: The URL to check.
Returns:
A URL to use instead or None if the original URL should be used.
"""
for url_pattern, endpoint in _oembed_patterns.items():
if url_pattern.fullmatch(url):
return endpoint
# No match.
return None
async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
"""
Request content from an oEmbed endpoint.
Params:
endpoint: The oEmbed API endpoint.
url: The URL to pass to the API.
Returns:
An object representing the metadata returned.
Raises:
OEmbedError if fetching or parsing of the oEmbed information fails.
"""
try:
logger.debug("Trying to get oEmbed content for url '%s'", url)
result = await self.client.get_json(
endpoint,
# TODO Specify max height / width.
# Note that only the JSON format is supported.
args={"url": url},
)
# Ensure there's a version of 1.0.
if result.get("version") != "1.0":
raise OEmbedError("Invalid version: %s" % (result.get("version"),))
oembed_type = result.get("type")
# Ensure the cache age is None or an int.
cache_age = result.get("cache_age")
if cache_age:
cache_age = int(cache_age)
oembed_result = OEmbedResult(None, None, result.get("title"), cache_age)
# HTML content.
if oembed_type == "rich":
oembed_result.html = result.get("html")
return oembed_result
if oembed_type == "photo":
oembed_result.url = result.get("url")
return oembed_result
# TODO Handle link and video types.
if "thumbnail_url" in result:
oembed_result.url = result.get("thumbnail_url")
return oembed_result
raise OEmbedError("Incompatible oEmbed information.")
except OEmbedError as e:
# Trap OEmbedErrors first so we can directly re-raise them.
logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
raise
except Exception as e:
# Trap any exception and let the code follow as usual.
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading oEmbed metadata from %s: %r", url, e)
raise OEmbedError() from e
async def _download_url(self, url, user):
# TODO: we should probably honour robots.txt... except in practice
# we're most likely being explicitly triggered by a human rather than a
......@@ -319,54 +460,90 @@ class PreviewUrlResource(DirectServeJsonResource):
file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
# If this URL can be accessed via oEmbed, use that instead.
url_to_download = url
oembed_url = self._get_oembed_url(url)
if oembed_url:
# The result might be a new URL to download, or it might be HTML content.
try:
logger.debug("Trying to get preview for url '%s'", url)
length, headers, uri, code = await self.client.get_file(
url,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e)
oembed_result = await self._get_oembed_content(oembed_url, url)
if oembed_result.url:
url_to_download = oembed_result.url
elif oembed_result.html:
url_to_download = None
except OEmbedError:
# If an error occurs, try doing a normal preview.
pass
raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()
if url_to_download:
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
try:
logger.debug("Trying to get preview for url '%s'", url_to_download)
length, headers, uri, code = await self.client.get_file(
url_to_download,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url_to_download, e)
raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()
if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"
download_name = get_filename_from_headers(headers)
# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers["ETag"][0] if "ETag" in headers else None
else:
html_bytes = oembed_result.html.encode("utf-8") # type: ignore
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
f.write(html_bytes)
await finish()
media_type = "text/html"
download_name = oembed_result.title
length = len(html_bytes)
# If a specific cache age was not given, assume 1 hour.
expires = oembed_result.cache_age or ONE_HOUR
uri = oembed_url
code = 200
etag = None
try:
if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"
time_now_ms = self.clock.time_msec()
download_name = get_filename_from_headers(headers)
await self.store.store_local_media(
media_id=file_id,
media_type=media_type,
time_now_ms=self.clock.time_msec(),
time_now_ms=time_now_ms,
upload_name=download_name,
media_length=length,
user_id=user,
......@@ -389,10 +566,8 @@ class PreviewUrlResource(DirectServeJsonResource):
"filename": fname,
"uri": uri,
"response_code": code,
# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
"expires": 60 * 60 * 1000,
"etag": headers["ETag"][0] if "ETag" in headers else None,
"expires": expires,
"etag": etag,
}
def _start_expire_url_cache_data(self):
......@@ -449,7 +624,7 @@ class PreviewUrlResource(DirectServeJsonResource):
# These may be cached for a bit on the client (i.e., they
# may have a room open with a preview url thing open).
# So we wait a couple of days before deleting, just in case.
expire_before = now - 2 * 24 * 60 * 60 * 1000
expire_before = now - 2 * 24 * ONE_HOUR
media_ids = await self.store.get_url_cache_media_before(expire_before)
removed_media = []
......
......@@ -12,8 +12,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import re
from mock import patch
import attr
......@@ -131,7 +134,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.reactor.nameResolver = Resolver()
def test_cache_returns_correct_type(self):
self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
request, channel = self.make_request(
"GET", "url_preview?url=http://matrix.org", shorthand=False
......@@ -187,7 +190,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
)
def test_non_ascii_preview_httpequiv(self):
self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
end_content = (
b"<html><head>"
......@@ -221,7 +224,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
def test_non_ascii_preview_content_type(self):
self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
end_content = (
b"<html><head>"
......@@ -254,7 +257,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
def test_overlong_title(self):
self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
end_content = (
b"<html><head>"
......@@ -292,7 +295,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
"""
IP addresses can be previewed directly.
"""
self.lookups["example.com"] = [(IPv4Address, "8.8.8.8")]
self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
request, channel = self.make_request(
"GET", "url_preview?url=http://example.com", shorthand=False
......@@ -439,7 +442,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
# Hardcode the URL resolving to the IP we want.
self.lookups["example.com"] = [
(IPv4Address, "1.1.1.2"),
(IPv4Address, "8.8.8.8"),
(IPv4Address, "10.1.2.3"),
]
request, channel = self.make_request(
......@@ -518,7 +521,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
"""
Accept-Language header is sent to the remote server
"""
self.lookups["example.com"] = [(IPv4Address, "8.8.8.8")]
self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
# Build and make a request to the server
request, channel = self.make_request(
......@@ -562,3 +565,126 @@ class URLPreviewTests(unittest.HomeserverTestCase):
),
server.data,
)
def test_oembed_photo(self):
"""Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
# Route the HTTP version to an HTTP endpoint so that the tests work.
with patch.dict(
"synapse.rest.media.v1.preview_url_resource._oembed_patterns",
{
re.compile(
r"http://twitter\.com/.+/status/.+"
): "http://publish.twitter.com/oembed",
},
clear=True,
):
self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
result = {
"version": "1.0",
"type": "photo",
"url": "http://cdn.twitter.com/matrixdotorg",
}
oembed_content = json.dumps(result).encode("utf-8")
end_content = (
b"<html><head>"
b"<title>Some Title</title>"
b'<meta property="og:description" content="hi" />'
b"</head></html>"
)
request, channel = self.make_request(
"GET",
"url_preview?url=http://twitter.com/matrixdotorg/status/12345",
shorthand=False,
)
request.render(self.preview_url)
self.pump()
client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b'Content-Type: application/json; charset="utf8"\r\n\r\n'
)
% (len(oembed_content),)
+ oembed_content
)
self.pump()
client = self.reactor.tcpClients[1][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b'Content-Type: text/html; charset="utf8"\r\n\r\n'
)
% (len(end_content),)
+ end_content
)
self.pump()
self.assertEqual(channel.code, 200)
self.assertEqual(
channel.json_body, {"og:title": "Some Title", "og:description": "hi"}
)
def test_oembed_rich(self):
"""Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
# Route the HTTP version to an HTTP endpoint so that the tests work.
with patch.dict(
"synapse.rest.media.v1.preview_url_resource._oembed_patterns",
{
re.compile(
r"http://twitter\.com/.+/status/.+"
): "http://publish.twitter.com/oembed",
},
clear=True,
):
self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
result = {
"version": "1.0",
"type": "rich",
"html": "<div>Content Preview</div>",
}
end_content = json.dumps(result).encode("utf-8")
request, channel = self.make_request(
"GET",
"url_preview?url=http://twitter.com/matrixdotorg/status/12345",
shorthand=False,
)
request.render(self.preview_url)
self.pump()
client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b'Content-Type: application/json; charset="utf8"\r\n\r\n'
)
% (len(end_content),)
+ end_content
)
self.pump()
self.assertEqual(channel.code, 200)
self.assertEqual(
channel.json_body,
{"og:title": None, "og:description": "Content Preview"},
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment