Skip to content
Snippets Groups Projects
Commit bb9a2ca8 authored by Matthew Hodgson's avatar Matthew Hodgson
Browse files

synthesise basig OG metadata from pages lacking it

parent 0d3d7de6
Branches
Tags
No related merge requests found
...@@ -23,6 +23,7 @@ from synapse.http.client import SpiderHttpClient ...@@ -23,6 +23,7 @@ from synapse.http.client import SpiderHttpClient
from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes
import os import os
import re
import ujson as json import ujson as json
import logging import logging
...@@ -70,6 +71,7 @@ class PreviewUrlResource(BaseMediaResource): ...@@ -70,6 +71,7 @@ class PreviewUrlResource(BaseMediaResource):
# define our OG response for this media # define our OG response for this media
elif self._is_html(media_info['media_type']): elif self._is_html(media_info['media_type']):
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
tree = html.parse(media_info['filename']) tree = html.parse(media_info['filename'])
# suck it up into lxml and define our OG response. # suck it up into lxml and define our OG response.
...@@ -82,17 +84,58 @@ class PreviewUrlResource(BaseMediaResource): ...@@ -82,17 +84,58 @@ class PreviewUrlResource(BaseMediaResource):
# "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png"
# "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP"
# "og:site_name" : "Twitter" # "og:site_name" : "Twitter"
# or:
# "og:type" : "video",
# "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
# "og:site_name" : "YouTube",
# "og:video:type" : "application/x-shockwave-flash",
# "og:description" : " ",
# "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
# "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
# "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
# "og:video:width" : "1280"
# "og:video:height" : "720",
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
og = {} og = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
og[tag.attrib['property']] = tag.attrib['content'] og[tag.attrib['property']] = tag.attrib['content']
if not og:
# do some basic spidering of the HTML
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
og['og:title'] = title[0].text if title else None
images = tree.xpath("//img")
big_images = [ i for i in images if (
'width' in i and 'height' in i and
i.attrib['width'] > 64 and i.attrib['height'] > 64
)] or images
og['og:image'] = images[0].attrib['src'] if images else None
text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()")
text = ''
for text_node in text_nodes:
if len(text) < 1024:
text += text_node + ' '
else:
break
text = re.sub(r'[\t ]+', ' ', text)
text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text)
text = text.strip()[:1024]
og['og:description'] = text if text else None
# TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too
# TODO: store our OG details in a cache (and expire them when stale) # TODO: store our OG details in a cache (and expire them when stale)
# TODO: delete the content to stop diskfilling, as we only ever cared about its OG # TODO: delete the content to stop diskfilling, as we only ever cared about its OG
else: else:
logger.warn("Failed to find any OG data in %s", url) logger.warn("Failed to find any OG data in %s", url)
og = {} og = {}
logger.warn(og)
respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
except: except:
# XXX: if we don't explicitly respond here, the request never returns. # XXX: if we don't explicitly respond here, the request never returns.
...@@ -111,6 +154,10 @@ class PreviewUrlResource(BaseMediaResource): ...@@ -111,6 +154,10 @@ class PreviewUrlResource(BaseMediaResource):
@defer.inlineCallbacks @defer.inlineCallbacks
def _download_url(self, url, user): def _download_url(self, url, user):
# TODO: we should probably honour robots.txt... except in practice
# we're most likely being explicitly triggered by a human rather than a
# bot, so are we really a robot?
# XXX: horrible duplication with base_resource's _download_remote_file() # XXX: horrible duplication with base_resource's _download_remote_file()
file_id = random_string(24) file_id = random_string(24)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment