# PDF Table Data Extractor
# by Eresse

# External Includes
require 'htmlentities'

# Internal Includes
require 'pdftdx/version'

# PDF TDX Module
module PDFTDX

  # Parser Module
  # Turns lines of positioned HTML (absolutely-positioned <p> elements carrying 'top' / 'left' pixel offsets)
  # into a tree of headered tables and named sub-tables.
  module Parser

    # Line Regex
    # Matches one positioned paragraph. Captures: 1 = top offset (px), 2 = left offset (px), 3 = inner HTML.
    LINE_REGEX = /^<p[^>]+top:([0-9]+)px[^>]+left:([0-9]+)px[^>]+>(.*)<\/p>/

    # Maximum Cell Length (to be considered usable data)
    MAX_CELL_LEN = 100

    # Page Offset
    # Added once per page to every 'top' offset, so that rows from different pages never share a key.
    PAGE_OFF = 10000

    # Maximum Allowed Offset from Page Top
    # Rows located at or below this in-page offset are dropped by filter_rows.
    PAGE_MAX_TOP = 1100

    # Title Cell Regex
    # A cell is a title / header cell when it contains bold markup.
    TITLE_CELL_REGEX = /<b>.*<\/b>/

    # Bold Tag Regex
    # Opening / closing bold tags, stripped from titles and headers once they have been identified.
    BOLD_TAG_REGEX = /<\/?b>/

    # HTML Line Break Regex
    # Accepts '<br>', '<br/>' and '<br />' (case-insensitive).
    BR_REGEX = /<br\s*\/?>/i

    # Is All Same Data
    # Determine whether a row's cells all contain the same data.
    # @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
    # @return [Boolean] True if all cells contain the same data (also true for an empty row), False otherwise.
    def self.is_all_same?(row_data)
      cells = row_data.values
      reference = cells.first
      cells.all? { |cell| cell == reference }
    end

    # Contains Unusable Data (Empty / Long Strings)
    # Determines whether a row contains unusable data.
    # @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
    # @return [Boolean] True if at least one cell is unusable (empty, or longer than MAX_CELL_LEN), False otherwise
    def self.contains_unusable?(row_data)
      row_data.any? { |_left, cell| cell.length == 0 || cell.length > MAX_CELL_LEN }
    end

    # HTML Filter
    # Replaces HTML newlines by UNIX-style newlines.
    # @param [String] s A string of HTML data
    # @return [String] A copy of the string with every HTML line break (<br>, <br/>, <br />) converted to a UNIX newline.
    def self.hfilter(s)
      s.gsub(BR_REGEX, "\n")
    end

    # Collect Data
    # Extracts table-like chunks of HTML data from a hash of HTML pages.
    # Lines which do not match LINE_REGEX are ignored.
    # @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<p style="position:absolute;top:10px;left:100px;white-space:nowrap">Machine OS</p>', 'This line is ignored'], 2 => ['<p style="position:absolute;top:10px;left:100px;white-space:nowrap">IP Address</p>'] }
    # @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. The 'top' value includes the page offset (PAGE_OFF for the first page, 2 * PAGE_OFF for the second, ...). Example: [{ top: 10010, left: 100, data: 'Machine OS' }, { top: 20010, left: 100, data: 'IP Address' }]
    def self.collect_data(data)

      # Build HTML Entity Decoder
      coder = HTMLEntities.new

      # Collect File Data
      off = 0
      data.flat_map do |_idx, page|

        # Shift this page's rows below those of every previous page
        off += PAGE_OFF

        page
          .map { |line| LINE_REGEX.match(line) }   # Extract Table Element Metadata (nil for non-table lines)
          .compact                                 # Keep Table-like data only
          .map { |m| { top: off + m[1].to_i, left: m[2].to_i, data: hfilter(coder.decode(m[3])) } } # Entities are decoded before line breaks are converted
      end
    end

    # Build Data Table
    # Produces an organized Table (in the form a 2-level nested hash) from an array of HTML chunks.
    # When two chunks share the same position, the later one wins.
    # @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
    # @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
    def self.build_table(data)
      data.each_with_object({}) do |chunk, table|
        row = (table[chunk[:top]] ||= {})
        row[chunk[:left]] = chunk[:data]
      end
    end

    # Filter Table Rows
    # Filters out rows considered unusable, empty, oversize, footers, etc...
    # Also, strips Top Offset info from Table Rows.
    # @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
    # @return [Array] An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
    def self.filter_rows(data)
      data
        .reject { |top, row| row.size < 2 || (top % PAGE_OFF) >= PAGE_MAX_TOP || is_all_same?(row) || contains_unusable?(row) } # Drop Single-Element Rows, Footer Data, Useless Rows (all cells identical) & Unusable Rows (Empty / Oversize Cells)
        .values                                                                                                                  # Remove 'top offset' information
    end

    # Determine Headered Table Length
    # Computes the number of rows to be included in a given headered table (header row included).
    # @param [Array] table An array of table rows.
    # @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
    # @param [Hash] h The current header row (determine htable length from this)
    # @param [Integer] i The current header's index within the *headers* array
    # @return [Integer] The number of rows, from the current header up to the next header (or the end of the table)
    def self.htable_length(table, headers, h, i)
      section_length(table, headers, h, i)
    end

    # Sub Table Length
    # Computes the number of rows to be included in a given sub-table (title row included).
    # @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['System', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
    # @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: 'System Info', idx: 0 }]
    # @param [Hash] t The current sub-table title row (determine stable length from this)
    # @param [Integer] i The current sub-table title's index within the *stables* array
    # @return [Integer] The number of rows, from the current title up to the next title (or the end of the table)
    def self.sub_tab_len(table, stables, t, i)
      section_length(table, stables, t, i)
    end

    # Section Length
    # Shared implementation of htable_length / sub_tab_len: distance from a marker row to the next marker row, or to the end of the table when it is the last one.
    # @param [Array] table An array of table rows
    # @param [Array] markers An array of marker hashes, each holding its row index under :idx
    # @param [Hash] marker The current marker
    # @param [Integer] i The current marker's index within the *markers* array
    # @return [Integer] The number of rows in the section
    def self.section_length(table, markers, marker, i)
      following = markers[i + 1]
      (following ? following[:idx] : table.length) - marker[:idx]
    end
    private_class_method :section_length

    # Sub-Tablize
    # Splits a table into multiple named tables.
    # A row starts a new named table when its first cell is a title cell (see TITLE_CELL_REGEX).
    # @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
    # @return [Array] An array of named tables, each represented as a hash containing the name and the table itself, always followed by one last array holding the rows found before the first title (unnamed, possibly empty). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
    def self.sub_tablize(htable_data)

      # Collect Sub-table Title Rows
      subtab_titles = htable_data.each_with_index
        .select { |row, _idx| TITLE_CELL_REGEX =~ row[0] }
        .map { |row, idx| { title: row[0], idx: idx } }

      # Pull up Sub-tables
      stables = subtab_titles.each_with_index.map do |t, i|
        rows = htable_data.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i))
        {
          name: t[:title].gsub(BOLD_TAG_REGEX, ''),
          data: rows.map { |row| row.reject.with_index { |cell, ci| ci == 0 && TITLE_CELL_REGEX =~ cell } } # The title cell itself is not part of the data
        }
      end

      # Data until first sub-table index is considered 'unsorted'
      unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles.first[:idx]
      stables << htable_data.slice(0, unsorted_end)
    end

    # Touch up Table
    # Splits Table into multiple headered tables.
    # Also, strips Left Offset info from Table Cells (this is done in place: the *table* array passed in is modified).
    # A row is a header when every one of its cells is a title cell (see TITLE_CELL_REGEX).
    # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => '<b>trauma.eresse.net</b>', 220 => '<b>durjaya.dooba.io</b>' }, { 100 => 'IP Address', 220 => '10.0.232.48' }]
    # @return [Array] An array of headered tables, each represented as a hash containing the header cells (:head) and the sub-tablized rows below it (:data, see sub_tablize), always followed by one last entry: the sub-tablized rows found before the first header. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io'], data: [[['IP Address', '10.0.232.48']]] }, [[]]]
    def self.touch_up(table)

      # Remove Column Offsets (cells keep the order in which they were inserted into the row hash)
      table.collect! { |row| row.map { |_left, cell| cell } }

      # Split Table into multiple Headered Tables
      headers = table.each_with_index
        .select { |row, _idx| row.all? { |cell| TITLE_CELL_REGEX =~ cell } }
        .map { |row, idx| { idx: idx, row: row.map { |cell| cell.gsub(BOLD_TAG_REGEX, '') } } }

      # Pull up Headered Tables (header row excluded) & split them into multiple Named Sub-Tables
      htables = headers.each_with_index.map do |h, i|
        body = table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1)
        { head: h[:row], data: sub_tablize(body) }
      end

      # Data until first Header index is considered 'unsorted'
      unsorted_end = headers.empty? ? table.length : headers.first[:idx]
      htables << sub_tablize(table.slice(0, unsorted_end))
    end

    # Process
    # Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
    # @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<p style="position:absolute;top:10px;left:100px;white-space:nowrap">Machine OS</p>', 'This is page one.'], 2 => ['Wow, another page of data !', 'Important stuff', "That's it for page 2 !"] }
    # @return [Array] An array of headered tables followed by one last entry for unsorted data, as produced by touch_up. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, [[]]]
    def self.process(page_data)

      # Collect Data
      data = collect_data(page_data)

      # Build Data Table
      table = build_table(data)

      # Filter Rows
      table = filter_rows(table)

      # Filter Table Cells & Touch up
      touch_up(table)
    end
  end
end