Skip to content
Snippets Groups Projects
Unverified commit 0fcc0ae3, authored by Patrick Cloke and committed by GitHub
Browse files

Improve URL previews for sites with only Twitter card information. (#13056)

Pull out `twitter:` meta tags when generating a preview and
use it to augment any `og:` meta tags.

Prefers Open Graph information over Twitter card information.
parent 75526152
No related branches found
No related tags found
No related merge requests found
Improve URL previews for sites which only provide Twitter Card metadata, e.g. LWN.net.
......@@ -15,7 +15,16 @@ import codecs
import itertools
import logging
import re
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
from typing import (
TYPE_CHECKING,
Callable,
Dict,
Generator,
Iterable,
Optional,
Set,
Union,
)
if TYPE_CHECKING:
from lxml import etree
......@@ -146,6 +155,70 @@ def decode_body(
return etree.fromstring(body, parser)
def _get_meta_tags(
tree: "etree.Element",
property: str,
prefix: str,
property_mapper: Optional[Callable[[str], Optional[str]]] = None,
) -> Dict[str, Optional[str]]:
"""
Search for meta tags prefixed with a particular string.
Args:
tree: The parsed HTML document.
property: The name of the property which contains the tag name, e.g.
"property" for Open Graph.
prefix: The prefix on the property to search for, e.g. "og" for Open Graph.
property_mapper: An optional callable to map the property to the Open Graph
form. Can return None for a key to ignore that key.
Returns:
A map of tag name to value.
"""
results: Dict[str, Optional[str]] = {}
for tag in tree.xpath(
f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
):
# if we've got more than 50 tags, someone is taking the piss
if len(results) >= 50:
logger.warning(
"Skipping parsing of Open Graph for page with too many '%s:' tags",
prefix,
)
return {}
key = tag.attrib[property]
if property_mapper:
key = property_mapper(key)
# None is a special value used to ignore a value.
if key is None:
continue
results[key] = tag.attrib["content"]
return results
def _map_twitter_to_open_graph(key: str) -> Optional[str]:
"""
Map a Twitter card property to the analogous Open Graph property.
Args:
key: The Twitter card property (starts with "twitter:").
Returns:
The Open Graph property (starts with "og:") or None to have this property
be ignored.
"""
# Twitter card properties with no analogous Open Graph property.
if key == "twitter:card" or key == "twitter:creator":
return None
if key == "twitter:site":
return "og:site_name"
# Otherwise, swap twitter to og.
return "og" + key[7:]
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
"""
Parse the HTML document into an Open Graph response.
......@@ -160,10 +233,8 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
The Open Graph response as a dictionary.
"""
# if we see any image URLs in the OG response, then spider them
# (although the client could choose to do this by asking for previews of those
# URLs to avoid DoSing the server)
# Search for Open Graph (og:) meta tags, e.g.:
#
# "og:type" : "video",
# "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
# "og:site_name" : "YouTube",
......@@ -176,19 +247,11 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:height" : "720",
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
og: Dict[str, Optional[str]] = {}
for tag in tree.xpath(
"//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
):
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]
# TODO: grab article: meta tags too, e.g.:
og = _get_meta_tags(tree, "property", "og")
# TODO: Search for properties specific to the different Open Graph types,
# such as article: meta tags, e.g.:
#
# "article:publisher" : "https://www.facebook.com/thethudonline" />
# "article:author" content="https://www.facebook.com/thethudonline" />
# "article:tag" content="baby" />
......@@ -196,6 +259,21 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "article:published_time" content="2016-03-31T19:58:24+00:00" />
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
# Search for Twitter Card (twitter:) meta tags, e.g.:
#
# "twitter:site" : "@matrixdotorg"
# "twitter:creator" : "@matrixdotorg"
#
# Twitter cards tags also duplicate Open Graph tags.
#
# See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started
twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph)
# Merge the Twitter values with the Open Graph values, but do not overwrite
# information from Open Graph tags.
for key, value in twitter.items():
if key not in og:
og[key] = value
if "og:title" not in og:
# Attempt to find a title from the title tag, or the biggest header on the page.
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
......
......@@ -370,6 +370,47 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
def test_twitter_tag(self) -> None:
    """Twitter card tags should be used if nothing else is available."""
    # The HTML literals are deliberately left unindented so that their bytes
    # are exactly those of the original fixture.
    html = b"""
<html>
<meta name="twitter:card" content="summary">
<meta name="twitter:description" content="Description">
<meta name="twitter:site" content="@matrixdotorg">
</html>
"""
    tree = decode_body(html, "http://example.com/test.html")
    og = parse_html_to_open_graph(tree)
    # "twitter:card" has no Open Graph analogue and is dropped; the other
    # tags are expected under their "og:" names. The document has no title
    # or heading, and "og:title" is expected to be present with a None value.
    self.assertEqual(
        og,
        {
            "og:title": None,
            "og:description": "Description",
            "og:site_name": "@matrixdotorg",
        },
    )

    # But they shouldn't override Open Graph values.
    html = b"""
<html>
<meta name="twitter:card" content="summary">
<meta name="twitter:description" content="Description">
<meta property="og:description" content="Real Description">
<meta name="twitter:site" content="@matrixdotorg">
<meta property="og:site_name" content="matrix.org">
</html>
"""
    tree = decode_body(html, "http://example.com/test.html")
    og = parse_html_to_open_graph(tree)
    self.assertEqual(
        og,
        {
            "og:title": None,
            "og:description": "Real Description",
            "og:site_name": "matrix.org",
        },
    )
class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset(self) -> None:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment