@ -3,6 +3,19 @@
class LinkDetailsExtractor
include ActionView :: Helpers :: TagHelper
# Some publications wrap their JSON-LD data in their <script> tags
# in commented-out CDATA blocks, they need to be removed before
# attempting to parse JSON
CDATA_JUNK_PATTERN = %r{ ^[ \ s]*(
( / \ *[ \ s]*<! \ [CDATA \ [[ \ s]* \ * / ) # Block comment style opening
|
( / / [ \ s ] * < ! \ [ CDATA \ [ ) # Single-line comment style opening
|
( / \ *[ \ s]* \ ] \ ]>[ \ s]* \ * / ) # Block comment style closing
|
( / / [ \ s ] * \ ] \ ] > ) # Single-line comment style closing
) [ \ s ] * $ } x
class StructuredData
SUPPORTED_TYPES = %w(
NewsArticle
@ -61,6 +74,10 @@ class LinkDetailsExtractor
publisher . dig ( 'logo' , 'url' )
end
def valid?
json . present?
end
private
def author
@ -134,11 +151,11 @@ class LinkDetailsExtractor
end
def title
structured_data& . headline || opengraph_tag ( 'og:title' ) || document . xpath ( '//title' ) . map ( & :content ) . first
html_entities. decode ( structured_data& . headline || opengraph_tag ( 'og:title' ) || document . xpath ( '//title' ) . map ( & :content ) . first )
end
def description
structured_data& . description || opengraph_tag ( 'og:description' ) || meta_tag ( 'description' )
html_entities. decode ( structured_data& . description || opengraph_tag ( 'og:description' ) || meta_tag ( 'description' ) )
end
def image
@ -146,11 +163,11 @@ class LinkDetailsExtractor
end
def canonical_url
valid_url_or_nil ( opengraph_tag( 'og:url' ) || link_tag ( 'canonica l') , same_origin_only : true ) || @original_url . to_s
valid_url_or_nil ( link_tag( 'canonical' ) || opengraph_tag ( 'og:ur l') , same_origin_only : true ) || @original_url . to_s
end
def provider_name
structured_data& . publisher_name || opengraph_tag ( 'og:site_name' )
html_entities. decode ( structured_data& . publisher_name || opengraph_tag ( 'og:site_name' ) )
end
def provider_url
@ -158,7 +175,7 @@ class LinkDetailsExtractor
end
def author_name
structured_data& . author_name || opengraph_tag ( 'og:author' ) || opengraph_tag ( 'og:author:username' )
html_entities. decode ( structured_data& . author_name || opengraph_tag ( 'og:author' ) || opengraph_tag ( 'og:author:username' ) )
end
def author_url
@ -223,10 +240,24 @@ class LinkDetailsExtractor
def structured_data
@structured_data || = begin
json_ld = document . xpath ( '//script[@type="application/ld+json"]' ) . map ( & :content ) . first
json_ld . present? ? StructuredData . new ( json_ld ) : nil
rescue Oj :: ParseError
nil
# Some publications have more than one JSON-LD definition on the page,
# and some of those definitions aren't valid JSON either, so we have
# to loop through here until we find something that is the right type
# and doesn't break
document . xpath ( '//script[@type="application/ld+json"]' ) . filter_map do | element |
json_ld = element . content & . gsub ( CDATA_JUNK_PATTERN , '' )
next if json_ld . blank?
structured_data = StructuredData . new ( html_entities . decode ( json_ld ) )
next unless structured_data . valid?
structured_data
rescue Oj :: ParseError , EncodingError
Rails . logger . debug ( " Invalid JSON-LD in #{ @original_url } " )
next
end . first
end
end
@ -246,4 +277,8 @@ class LinkDetailsExtractor
detector . strip_tags = true
end
end
def html_entities
@html_entities || = HTMLEntities . new
end
end