Sha256: 76190afb9af8595adce16df29a8bc3520c9482b57caaa1b271d43b65f90c6d20
Contents?: true
Size: 1.97 KB
Versions: 2
Compression:
Stored size: 1.97 KB
Contents
# PDF Table Data Extractor
# by Eresse <eresse@eresse.net>

# External Includes
require 'htmlentities'

# Internal Includes
require 'pdftdx/version'

# PDF TDX Module
module PDFTDX
  # Parser Module
  #
  # Converts per-page lists of HTML `<p style=...>` lines into table rows by
  # grouping the paragraphs that share the same vertical (`top`) position.
  module Parser
    # Line Regex
    # Matches a line starting with `<p style` that carries `top:<n>px` and
    # `left:<n>px`; captures (1) top, (2) left and (3) the paragraph content.
    LINE_REGEX = /^<p style[^>]+top:([0-9]+)px[^>]+left:([0-9]+)px[^>]+>(.*)<\/p>/

    # Maximum Cell Length (to be considered usable data)
    MAX_CELL_LEN = 100

    # Page Offset
    # Added to every `top` value once per page so rows from different pages
    # never share a key; `top % PAGE_OFF` recovers the in-page position.
    PAGE_OFF = 10000

    # Footer Threshold
    # Rows whose in-page `top` is at or beyond this value are dropped as footers.
    FOOTER_TOP = 1110

    # Title Cell Regex
    TITLE_CELL_REGEX = /<bbb>/

    # Check Same Line
    #
    # @param data [Array<Hash>] entries exposing a `:top` key
    # @param idx_a [Integer] index of the first entry
    # @param idx_b [Integer] index of the second entry
    # @return [Boolean] true when both entries have the same `:top`
    def self.same_line(data, idx_a, idx_b)
      data[idx_a][:top] == data[idx_b][:top]
    end

    # Is All Same Data
    #
    # @param row_data [Hash] row cells, keyed by left position
    # @return [Boolean] true when every cell equals the first one
    #   (also true for an empty row)
    def self.is_all_same(row_data)
      cells = row_data.values
      first = cells.first
      cells.all? { |cell| cell == first }
    end

    # Contains Unusable Data (Empty / Long Strings)
    #
    # @param row_data [Hash] row cells, keyed by left position
    # @return [Boolean] true when any cell is empty or longer than MAX_CELL_LEN
    def self.contains_unusable(row_data)
      row_data.values.any? { |cell| cell.empty? || cell.length > MAX_CELL_LEN }
    end

    # Process Data
    #
    # Groups cells into rows by `:top`, drops unusable rows and title cells,
    # prints the resulting table and returns it.
    #
    # @param data [Array<Hash>] entries of the form `{ top:, left:, data: }`
    # @return [Array<Hash>] one `{ left => data }` hash per remaining row
    def self.process_data(data)
      # Build Data Table: { top => { left => data } }
      table = data.each_with_object({}) do |d, rows|
        (rows[d[:top]] ||= {})[d[:left]] = d[:data]
      end

      # Filter Table Rows (Remove Lone Elements & Footers)
      table.reject! do |top, row|
        row.size < 2 || (top % PAGE_OFF) >= FOOTER_TOP || is_all_same(row) || contains_unusable(row)
      end

      # Filter Table Cells (drop title cells), then drop any row left with
      # fewer than two cells. A single `size < 2` check covers empty rows too.
      table = table.values
                   .map { |row| row.reject { |_left, cell| TITLE_CELL_REGEX =~ cell } }
                   .reject { |row| row.size < 2 }

      # DEBUG
      # NOTE(review): output kept as-is in case a caller relies on it; consider
      # moving it behind a logger or flag once callers are confirmed.
      puts "=============> #{table}"

      # Return the table explicitly; `puts` returns nil.
      table
    end

    # HTML Filter
    #
    # @param s [String] decoded cell text
    # @return [String] copy of `s` with every `<br/>` replaced by a newline
    def self.hfilter(s)
      s.gsub('<br/>', "\n")
    end

    # Process Page Files
    #
    # Extracts every line matching LINE_REGEX from each page, decodes its HTML
    # entities and hands the resulting cells to {process_data}.
    #
    # @param page_data [Enumerable] `[index, lines]` pairs (e.g. a Hash); pages
    #   are offset by their iteration order, the index itself is not used
    # @return [Array<Hash>] the rows produced by {process_data}
    def self.process_page_files(page_data)
      # Build HTML Entity Decoder
      coder = HTMLEntities.new

      # Collect & Process File Data
      cells = page_data.each_with_index.flat_map do |(_idx, page), page_no|
        # First page starts at PAGE_OFF, second at 2 * PAGE_OFF, and so on.
        off = (page_no + 1) * PAGE_OFF

        # Match each line once; non-matching lines yield nil and are dropped.
        page.map { |line| LINE_REGEX.match(line) }.compact.map do |m|
          { top: off + m[1].to_i, left: m[2].to_i, data: hfilter(coder.decode(m[3])) }
        end
      end

      process_data(cells)
    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
pdftdx-0.2.0 | lib/pdftdx/parser.rb |
pdftdx-0.1.0 | lib/pdftdx/parser.rb |