I've been trying to identify hyperlinks in PDFs, but when I use page.text, I only capture links whose URLs appear in the text itself.
Example 1, this works: http://www.google.com
Example 2, this doesn't work: Google (a hyperlink whose visible text is "Google" rather than the URL itself)
How can we use pdf-reader to capture all links? Thanks very much.
For low-level access to the objects in a PDF file, use the ObjectHash class like so:
reader = PDF::Reader.new("somefile.pdf")
puts reader.objects.inspect
--
You received this message because you are subscribed to the Google Groups "PDF::Reader" group.
To unsubscribe from this group and stop receiving emails from it, send an email to pdf-reader+...@googlegroups.com.
To post to this group, send email to pdf-r...@googlegroups.com.
Visit this group at https://groups.google.com/group/pdf-reader.
For more options, visit https://groups.google.com/d/optout.
reader = PDF::Reader.new("somefile.pdf")
puts reader.objects.inspect
#<PDF::Reader::ObjectHash:0x007fdace91b800 @io=#<Tempfile:/var/folders/qz/k6v28_9n49526y85259mdpl00000gn/T/open-uri20190123-3078-pfhv5d>, @xref=#<PDF::Reader::XRef:0x007fdace91b7d8 @io=#<Tempfile:/var/folders/qz/k6v28_9n49526y85259mdpl00000gn/T/open-uri20190123-3078-pfhv5d>, @junk_offset=0, @xref={1=>{0=>8944}, 2=>{0=>8652}, 3=>{0=>6857}, 4=>{0=>5800}, 5=>{0=>36}, 6=>{0=>5583}, 7=>{0=>16}, 8=>{0=>5565}, 9=>{0=>10542}, 10=>{0=>9321}, 11=>{0=>9944}, 12=>{0=>6647}, 13=>{0=>5780}, 14=>{0=>8910}, 15=>{0=>8362}, 16=>{0=>7263}, 17=>{0=>7181}, 18=>{0=>7242}, 19=>{0=>8731}, 20=>{0=>9115}, 21=>{0=>9728}, 22=>{0=>10331}, 23=>{0=>11281}, 24=>{0=>11245}, 25=>{0=>11206}, 26=>{0=>11115}, 27=>{0=>11345}}, @trailer={:Info=>#<PDF::Reader::Reference:0x007fdace910428 @id=27, @gen=0>, :ID=>["C\xD9y<\xD5\x87\xC7\f\xEEv\xD8G\x19\xC5\x84\xA3", "C\xD9y<\xD5\x87\xC7\f\xEEv\xD8G\x19\xC5\x84\xA3"], :Size=>28, :Root=>#<PDF::Reader::Reference:0x007fdace909768 @id=23, @gen=0>}>, @pdf_version=1.4, @trailer={:Info=>#<PDF::Reader::Reference:0x007fdace910428 @id=27, @gen=0>, :ID=>["C\xD9y<\xD5\x87\xC7\f\xEEv\xD8G\x19\xC5\x84\xA3", "C\xD9y<\xD5\x87\xC7\f\xEEv\xD8G\x19\xC5\x84\xA3"], :Size=>28, :Root=>#<PDF::Reader::Reference:0x007fdace909768 @id=23, @gen=0>}, @cache=#<PDF::Reader::ObjectCache:0x007fdace91b918 @objects={#<PDF::Reader::Reference:0x007fdace909768 @id=23, @gen=0>=>{:Type=>:Catalog, :Pages=>#<PDF::Reader::Reference:0x007fdac80d48f0 @id=1, @gen=0>, :Names=>#<PDF::Reader::Reference:0x007fdac80d4620 @id=24, @gen=0>}, #<PDF::Reader::Reference:0x007fdac80d48f0 @id=1, @gen=0>=>{:Type=>:Pages, :Kids=>[#<PDF::Reader::Reference:0x007fdace8f8300 @id=2, @gen=0>], :Count=>2, :MediaBox=>[0, 0, 595.27563, 841.88977], :CropBox=>[0, 0, 595.27563, 841.88977], :BleedBox=>[0, 0, 595.27563, 841.88977]}, #<PDF::Reader::Reference:0x007fdace8f8300 @id=2, @gen=0>=>{:Type=>:Pages, :Kids=>[#<PDF::Reader::Reference:0x007fdac90d2a40 @id=3, @gen=0>, #<PDF::Reader::Reference:0x007fdac90d2838 @id=15, @gen=0>], 
:Count=>2, :Parent=>#<PDF::Reader::Reference:0x007fdac90d0880 @id=1, @gen=0>}, #<PDF::Reader::Reference:0x007fdac90d2a40 @id=3, @gen=0>=>{:Type=>:Page, :Trans=>{}, :Parent=>#<PDF::Reader::Reference:0x007fdacd45b488 @id=2, @gen=0>, :Resources=>{:Font=>{:F0=>#<PDF::Reader::Reference:0x007fdacd458918 @id=9, @gen=0>, :F1=>#<PDF::Reader::Reference:0x007fdacd458648 @id=10, @gen=0>, :F2=>#<PDF::Reader::Reference:0x007fdacd458328 @id=11, @gen=0>}, :ProcSet=>[:PDF, :Text, :ImageB, :ImageC], :XObject=>{:I1=>#<PDF::Reader::Reference:0x007fdacd449198 @id=5, @gen=0>}}, :MediaBox=>[0, 0, 612.28351, 790.86615], :CropBox=>[0, 0, 612.28351, 790.86615], :BleedBox=>[0, 0, 612.28351, 790.86615], :Contents=>[#<PDF::Reader::Reference:0x007fdac88ca820 @id=4, @gen=0>], :Annots=>#<PDF::Reader::Reference:0x007fdac88c9218 @id=14, @gen=0>}, #<PDF::Reader::Reference:0x007fdac90d2838 @id=15, @gen=0>=>{:Type=>:Page, :Trans=>{}, :Parent=>#<PDF::Reader::Reference:0x007fdacca70040 @id=2, @gen=0>, :Resources=>{:Font=>{:F0=>#<PDF::Reader::Reference:0x007fdac80b13f0 @id=9, @gen=0>, :F1=>#<PDF::Reader::Reference:0x007fdac80b1120 @id=10, @gen=0>}, :ProcSet=>[:Text, :ImageC], :XObject=>{:I1=>#<PDF::Reader::Reference:0x007fdac80a9b78 @id=5, @gen=0>}}, :MediaBox=>[0, 0, 612.28351, 790.86615], :CropBox=>[0, 0, 612.28351, 790.86615], :BleedBox=>[0, 0, 612.28351, 790.86615], :Contents=>[#<PDF::Reader::Reference:0x007fdac88bbdc0 @id=16, @gen=0>]}}, @lru_cache={#<PDF::Reader::Reference:0x007fdace910428>=>{:CreationDate=>"D:20130325152227-05'00'", :ModDate=>"D:20130325152227-05'00'", :Creator=>"AH XSL Formatter V6.0 MR7 for Windows (x64) : 6.0.8.9416 (2013/02/26 10:36JST)", :Producer=>"Antenna House PDF Output Library 6.0.389 (Windows (x64))", :Trapped=>:False}}, @hits=6, @misses=0>, @sec_handler=#<PDF::Reader::NullSecurityHandler:0x007fdace908700>, @page_references=[#<PDF::Reader::Reference:0x007fdac90d2a40 @id=3, @gen=0>, #<PDF::Reader::Reference:0x007fdac90d2838 @id=15, @gen=0>]>
reader = PDF::Reader.new("https://www.antennahouse.com/XSLsample/pdf/sample-link_1.pdf")
puts reader.objects.inspect