Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
synapse
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Monitor
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Maunium
synapse
Commits
bb9a2ca8
Commit
bb9a2ca8
authored
9 years ago
by
Matthew Hodgson
Browse files
Options
Downloads
Patches
Plain Diff
synthesise basig OG metadata from pages lacking it
parent
0d3d7de6
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
synapse/rest/media/v1/preview_url_resource.py
+47
-0
47 additions, 0 deletions
synapse/rest/media/v1/preview_url_resource.py
with
47 additions
and
0 deletions
synapse/rest/media/v1/preview_url_resource.py
+
47
−
0
View file @
bb9a2ca8
...
...
@@ -23,6 +23,7 @@ from synapse.http.client import SpiderHttpClient
from
synapse.http.server
import
request_handler
,
respond_with_json
,
respond_with_json_bytes
import
os
import
re
import
ujson
as
json
import
logging
...
...
@@ -70,6 +71,7 @@ class PreviewUrlResource(BaseMediaResource):
# define our OG response for this media
elif
self
.
_is_html
(
media_info
[
'
media_type
'
]):
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
tree
=
html
.
parse
(
media_info
[
'
filename
'
])
# suck it up into lxml and define our OG response.
...
...
@@ -82,17 +84,58 @@ class PreviewUrlResource(BaseMediaResource):
# "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png"
# "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP"
# "og:site_name" : "Twitter"
# or:
# "og:type" : "video",
# "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
# "og:site_name" : "YouTube",
# "og:video:type" : "application/x-shockwave-flash",
# "og:description" : " ",
# "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
# "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
# "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
# "og:video:width" : "1280"
# "og:video:height" : "720",
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
og
=
{}
for
tag
in
tree
.
xpath
(
"
//*/meta[starts-with(@property,
'
og:
'
)]
"
):
og
[
tag
.
attrib
[
'
property
'
]]
=
tag
.
attrib
[
'
content
'
]
if
not
og
:
# do some basic spidering of the HTML
title
=
tree
.
xpath
(
"
(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]
"
)
og
[
'
og:title
'
]
=
title
[
0
].
text
if
title
else
None
images
=
tree
.
xpath
(
"
//img
"
)
big_images
=
[
i
for
i
in
images
if
(
'
width
'
in
i
and
'
height
'
in
i
and
i
.
attrib
[
'
width
'
]
>
64
and
i
.
attrib
[
'
height
'
]
>
64
)]
or
images
og
[
'
og:image
'
]
=
images
[
0
].
attrib
[
'
src
'
]
if
images
else
None
text_nodes
=
tree
.
xpath
(
"
//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()
"
)
text
=
''
for
text_node
in
text_nodes
:
if
len
(
text
)
<
1024
:
text
+=
text_node
+
'
'
else
:
break
text
=
re
.
sub
(
r
'
[\t ]+
'
,
'
'
,
text
)
text
=
re
.
sub
(
r
'
[\t \r\n]*[\r\n]+
'
,
'
\n
'
,
text
)
text
=
text
.
strip
()[:
1024
]
og
[
'
og:description
'
]
=
text
if
text
else
None
# TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too
# TODO: store our OG details in a cache (and expire them when stale)
# TODO: delete the content to stop diskfilling, as we only ever cared about its OG
else
:
logger
.
warn
(
"
Failed to find any OG data in %s
"
,
url
)
og
=
{}
logger
.
warn
(
og
)
respond_with_json_bytes
(
request
,
200
,
json
.
dumps
(
og
),
send_cors
=
True
)
except
:
# XXX: if we don't explicitly respond here, the request never returns.
...
...
@@ -111,6 +154,10 @@ class PreviewUrlResource(BaseMediaResource):
@defer.inlineCallbacks
def
_download_url
(
self
,
url
,
user
):
# TODO: we should probably honour robots.txt... except in practice
# we're most likely being explicitly triggered by a human rather than a
# bot, so are we really a robot?
# XXX: horrible duplication with base_resource's _download_remote_file()
file_id
=
random_string
(
24
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment