Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
synapse
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Monitor
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Maunium
synapse
Commits
5fd07da7
Commit
5fd07da7
authored
9 years ago
by
Matthew Hodgson
Browse files
Options
Downloads
Patches
Plain Diff
refactor calc_og; spider image URLs; fix xpath; add a (broken) expiringcache; loads of other fixes
parent
c60b7516
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
synapse/rest/media/v1/preview_url_resource.py
+121
-81
121 additions, 81 deletions
synapse/rest/media/v1/preview_url_resource.py
with
121 additions
and
81 deletions
synapse/rest/media/v1/preview_url_resource.py
+
121
−
81
View file @
5fd07da7
...
@@ -20,6 +20,7 @@ from twisted.internet import defer
...
@@ -20,6 +20,7 @@ from twisted.internet import defer
from
lxml
import
html
from
lxml
import
html
from
urlparse
import
urlparse
,
urlunparse
from
urlparse
import
urlparse
,
urlunparse
from
synapse.util.stringutils
import
random_string
from
synapse.util.stringutils
import
random_string
from
synapse.util.caches.expiringcache
import
ExpiringCache
from
synapse.http.client
import
SpiderHttpClient
from
synapse.http.client
import
SpiderHttpClient
from
synapse.http.server
import
request_handler
,
respond_with_json
,
respond_with_json_bytes
from
synapse.http.server
import
request_handler
,
respond_with_json
,
respond_with_json_bytes
...
@@ -36,6 +37,12 @@ class PreviewUrlResource(BaseMediaResource):
...
@@ -36,6 +37,12 @@ class PreviewUrlResource(BaseMediaResource):
def __init__(self, hs, filepaths):
    """Create the URL-preview resource.

    Args:
        hs: the HomeServer instance (supplies the clock used by the cache).
        filepaths: media file-path helper, passed through to BaseMediaResource.
    """
    BaseMediaResource.__init__(self, hs, filepaths)
    # Dedicated HTTP client used to spider remote URLs for previews.
    self.client = SpiderHttpClient(hs)
    # In-memory cache of previously computed OG responses, keyed by URL.
    # NOTE(review): the commit message itself calls this cache "broken" —
    # treat its behaviour with suspicion until verified.
    self.cache = ExpiringCache(
        cache_name="url_previews",
        clock=self.clock,
        # don't spider URLs more often than once an hour
        expiry_ms=60 * 60 * 1000,
    )
    self.cache.start()
def
render_GET
(
self
,
request
):
def
render_GET
(
self
,
request
):
self
.
_async_render_GET
(
request
)
self
.
_async_render_GET
(
request
)
...
@@ -50,6 +57,11 @@ class PreviewUrlResource(BaseMediaResource):
...
@@ -50,6 +57,11 @@ class PreviewUrlResource(BaseMediaResource):
requester
=
yield
self
.
auth
.
get_user_by_req
(
request
)
requester
=
yield
self
.
auth
.
get_user_by_req
(
request
)
url
=
request
.
args
.
get
(
"
url
"
)[
0
]
url
=
request
.
args
.
get
(
"
url
"
)[
0
]
if
self
.
cache
:
og
=
self
.
cache
.
get
(
url
)
respond_with_json_bytes
(
request
,
200
,
json
.
dumps
(
og
),
send_cors
=
True
)
return
# TODO: keep track of whether there's an ongoing request for this preview
# TODO: keep track of whether there's an ongoing request for this preview
# and block and return their details if there is one.
# and block and return their details if there is one.
...
@@ -74,98 +86,25 @@ class PreviewUrlResource(BaseMediaResource):
...
@@ -74,98 +86,25 @@ class PreviewUrlResource(BaseMediaResource):
elif
self
.
_is_html
(
media_info
[
'
media_type
'
]):
elif
self
.
_is_html
(
media_info
[
'
media_type
'
]):
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
def
_calc_og
():
# suck it up into lxml and define our OG response.
# if we see any URLs in the OG response, then spider them
# (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server)
# "og:type" : "article"
# "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672"
# "og:title" : "Matrix on Twitter"
# "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png"
# "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP"
# "og:site_name" : "Twitter"
# or:
# "og:type" : "video",
# "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
# "og:site_name" : "YouTube",
# "og:video:type" : "application/x-shockwave-flash",
# "og:description" : " ",
# "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
# "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
# "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
# "og:video:width" : "1280"
# "og:video:height" : "720",
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
og
=
{}
for
tag
in
tree
.
xpath
(
"
//*/meta[starts-with(@property,
'
og:
'
)]
"
):
og
[
tag
.
attrib
[
'
property
'
]]
=
tag
.
attrib
[
'
content
'
]
if
'
og:title
'
not
in
og
:
# do some basic spidering of the HTML
title
=
tree
.
xpath
(
"
(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]
"
)
og
[
'
og:title
'
]
=
title
[
0
].
text
if
title
else
None
if
'
og:image
'
not
in
og
:
meta_image
=
tree
.
xpath
(
"
//*/meta[@itemprop=
'
image
'
]/@content
"
);
if
meta_image
:
og
[
'
og:image
'
]
=
self
.
_rebase_url
(
meta_image
[
0
],
media_info
[
'
uri
'
])
else
:
images
=
[
i
for
i
in
tree
.
xpath
(
"
//img
"
)
if
'
src
'
in
i
.
attrib
]
big_images
=
[
i
for
i
in
images
if
(
'
width
'
in
i
.
attrib
and
'
height
'
in
i
.
attrib
and
i
.
attrib
[
'
width
'
]
>
64
and
i
.
attrib
[
'
height
'
]
>
64
)]
big_images
=
big_images
.
sort
(
key
=
lambda
i
:
(
-
1
*
int
(
i
.
attrib
[
'
width
'
])
*
int
(
i
.
attrib
[
'
height
'
])))
images
=
big_images
if
big_images
else
images
if
images
:
og
[
'
og:image
'
]
=
self
.
_rebase_url
(
images
[
0
].
attrib
[
'
src
'
],
media_info
[
'
uri
'
])
if
'
og:description
'
not
in
og
:
meta_description
=
tree
.
xpath
(
"
//*/meta[@name=
'
description
'
]/@content
"
);
if
meta_description
:
og
[
'
og:description
'
]
=
meta_description
[
0
]
else
:
text_nodes
=
tree
.
xpath
(
"
//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()
"
)
# text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()")
text
=
''
for
text_node
in
text_nodes
:
if
len
(
text
)
<
500
:
text
+=
text_node
+
'
'
else
:
break
text
=
re
.
sub
(
r
'
[\t ]+
'
,
'
'
,
text
)
text
=
re
.
sub
(
r
'
[\t \r\n]*[\r\n]+
'
,
'
\n
'
,
text
)
text
=
text
.
strip
()[:
500
]
og
[
'
og:description
'
]
=
text
if
text
else
None
# TODO: extract a favicon?
# TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too
# TODO: store our OG details in a cache (and expire them when stale)
# TODO: delete the content to stop diskfilling, as we only ever cared about its OG
return
og
try
:
try
:
tree
=
html
.
parse
(
media_info
[
'
filename
'
])
tree
=
html
.
parse
(
media_info
[
'
filename
'
])
og
=
_calc_og
(
)
og
=
yield
self
.
_calc_og
(
tree
,
media_info
,
requester
)
except
UnicodeDecodeError
:
except
UnicodeDecodeError
:
# XXX: evil evil bodge
# XXX: evil evil bodge
file
=
open
(
media_info
[
'
filename
'
])
file
=
open
(
media_info
[
'
filename
'
])
body
=
file
.
read
()
body
=
file
.
read
()
file
.
close
()
file
.
close
()
tree
=
html
.
fromstring
(
body
.
decode
(
'
utf-8
'
,
'
ignore
'
))
tree
=
html
.
fromstring
(
body
.
decode
(
'
utf-8
'
,
'
ignore
'
))
og
=
_calc_og
(
)
og
=
yield
self
.
_calc_og
(
tree
,
media_info
,
requester
)
else
:
else
:
logger
.
warn
(
"
Failed to find any OG data in %s
"
,
url
)
logger
.
warn
(
"
Failed to find any OG data in %s
"
,
url
)
og
=
{}
og
=
{}
logger
.
warn
(
og
)
if
self
.
cache
:
self
.
cache
[
url
]
=
og
logger
.
warn
(
og
);
respond_with_json_bytes
(
request
,
200
,
json
.
dumps
(
og
),
send_cors
=
True
)
respond_with_json_bytes
(
request
,
200
,
json
.
dumps
(
og
),
send_cors
=
True
)
except
:
except
:
...
@@ -182,11 +121,112 @@ class PreviewUrlResource(BaseMediaResource):
...
@@ -182,11 +121,112 @@ class PreviewUrlResource(BaseMediaResource):
)
)
raise
raise
@defer.inlineCallbacks
def _calc_og(self, tree, media_info, requester):
    """Derive an OpenGraph-style dict from a parsed HTML tree.

    Walks the lxml ``tree`` for ``og:`` meta tags, then fills in missing
    title / image / description by basic spidering of the document. If an
    image URL is found it is downloaded and thumbnailed so the response can
    point at a local mxc:// URI.

    Args:
        tree: lxml HTML element tree of the fetched page.
        media_info: dict describing the fetched page; 'uri' is used as the
            base for rebasing relative URLs.
        requester: the authenticated requester; ``requester.user`` is passed
            to the image download.

    Returns (via Deferred):
        dict mapping OG property names to values.

    Example shapes this aims to produce:
    # "og:type"         : "article"
    # "og:url"          : "https://twitter.com/matrixdotorg/status/684074366691356672"
    # "og:title"        : "Matrix on Twitter"
    # "og:image"        : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png"
    # "og:description"  : "Synapse 0.12 is out! ..."
    # "og:site_name"    : "Twitter"
    # or:
    # "og:type"         : "video",
    # "og:url"          : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
    # "og:site_name"    : "YouTube",
    # "og:video:type"   : "application/x-shockwave-flash",
    # "og:description"  : " ",
    # "og:title"        : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
    # "og:image"        : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
    # "og:video:url"    : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
    # "og:video:width"  : "1280"
    # "og:video:height" : "720",
    # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
    """
    og = {}
    for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
        og[tag.attrib['property']] = tag.attrib['content']

    # TODO: grab article: meta tags too, e.g.:
    # <meta property="article:publisher" content="https://www.facebook.com/thethudonline" />
    # <meta property="article:author" content="https://www.facebook.com/thethudonline" />
    # <meta property="article:tag" content="baby" />
    # <meta property="article:section" content="Breaking News" />
    # <meta property="article:published_time" content="2016-03-31T19:58:24+00:00" />
    # <meta property="article:modified_time" content="2016-04-01T18:31:53+00:00" />

    if 'og:title' not in og:
        # do some basic spidering of the HTML
        title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
        # FIX: the matched element's .text may be None (e.g. an empty
        # <title></title>), which previously raised AttributeError on .strip()
        og['og:title'] = (
            title[0].text.strip() if title and title[0].text else None
        )

    if 'og:image' not in og:
        # TODO: extract a favicon failing all else
        meta_image = tree.xpath("//*/meta[@itemprop='image']/@content")
        if meta_image:
            og['og:image'] = self._rebase_url(meta_image[0], media_info['uri'])
        else:
            # prefer images big enough to be a plausible preview, largest first
            images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
            images = sorted(images, key=lambda i: (
                -1 * int(i.attrib['width']) * int(i.attrib['height'])
            ))
            if not images:
                images = tree.xpath("//img[@src]")
            if images:
                og['og:image'] = self._rebase_url(
                    images[0].attrib['src'], media_info['uri']
                )

    # pre-cache the image for posterity
    if 'og:image' in og and og['og:image']:
        image_info = yield self._download_url(og['og:image'], requester.user)

        if self._is_media(image_info['media_type']):
            # TODO: make sure we don't choke on white-on-transparent images
            dims = yield self._generate_local_thumbnails(
                image_info['filesystem_id'], image_info
            )
            og["og:image"] = "mxc://%s/%s" % (
                self.server_name, image_info['filesystem_id']
            )
            og["og:image:type"] = image_info['media_type']
            og["og:image:width"] = dims['width']
            og["og:image:height"] = dims['height']
        else:
            # the spidered URL wasn't actually an image; drop it
            del og["og:image"]

    if 'og:description' not in og:
        meta_description = tree.xpath("//*/meta[@name='description']/@content")
        if meta_description:
            og['og:description'] = meta_description[0]
        else:
            # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()")
            text_nodes = tree.xpath(
                "//text()[not(ancestor::header | ancestor::nav | ancestor::aside | " +
                "ancestor::footer | ancestor::script | ancestor::style)]" +
                "[ancestor::body]"
            )
            text = ''
            for text_node in text_nodes:
                if len(text) < 500:
                    text += text_node + ' '
                else:
                    break
            # collapse runs of spaces/tabs, then normalise newline runs
            text = re.sub(r'[\t ]+', ' ', text)
            text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text)
            text = text.strip()[:500]
            og['og:description'] = text if text else None

    # TODO: persist a cache mapping { url, etag } -> { og, mxc of url (if we bother keeping it around), age }
    # TODO: delete the url downloads to stop diskfilling, as we only ever cared about its OG
    defer.returnValue(og)
def
_rebase_url
(
self
,
url
,
base
):
def
_rebase_url
(
self
,
url
,
base
):
base
=
list
(
urlparse
(
base
))
base
=
list
(
urlparse
(
base
))
url
=
list
(
urlparse
(
url
))
url
=
list
(
urlparse
(
url
))
if
not
url
[
0
]
and
not
url
[
1
]:
if
not
url
[
0
]:
url
[
0
]
=
base
[
0
]
url
[
0
]
=
base
[
0
]
or
"
http
"
if
not
url
[
1
]:
url
[
1
]
=
base
[
1
]
url
[
1
]
=
base
[
1
]
if
not
url
[
2
].
startswith
(
'
/
'
):
if
not
url
[
2
].
startswith
(
'
/
'
):
url
[
2
]
=
re
.
sub
(
r
'
/[^/]+$
'
,
'
/
'
,
base
[
2
])
+
url
[
2
]
url
[
2
]
=
re
.
sub
(
r
'
/[^/]+$
'
,
'
/
'
,
base
[
2
])
+
url
[
2
]
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment