Sha256: 76190afb9af8595adce16df29a8bc3520c9482b57caaa1b271d43b65f90c6d20

Contents?: true

Size: 1.97 KB

Versions: 2

Compression:

Stored size: 1.97 KB

Contents

# PDF Table Data Extractor
# by Eresse <eresse@eresse.net>

# External Includes
require 'htmlentities'

# Internal Includes
require 'pdftdx/version'

# PDF TDX Module
module PDFTDX

	# Parser Module
	#
	# Converts positioned HTML paragraph lines (matching LINE_REGEX) into table
	# rows: cells sharing the same 'top' offset form a row, keyed by 'left' offset.
	module Parser

		# Line Regex (captures: 1 = top offset in px, 2 = left offset in px, 3 = paragraph content)
		LINE_REGEX = /^<p style[^>]+top:([0-9]+)px[^>]+left:([0-9]+)px[^>]+>(.*)<\/p>/

		# Maximum Cell Length (to be considered usable data)
		MAX_CELL_LEN = 100

		# Page Offset (added to 'top' once per page so rows of different pages never share a key)
		PAGE_OFF = 10000

		# Footer Offset (rows whose in-page 'top' is at or beyond this value are dropped as footers)
		FOOTER_TOP = 1110

		# Title Cell Regex
		TITLE_CELL_REGEX = /<bbb>/

		# Check Same Line
		# @param data [Array<Hash>] cell hashes carrying a :top key
		# @param idx_a [Integer] index of the first cell
		# @param idx_b [Integer] index of the second cell
		# @return [Boolean] true when both cells share the same :top offset
		def self.same_line(data, idx_a, idx_b)
			data[idx_a][:top] == data[idx_b][:top]
		end

		# Is All Same Data
		# @param row_data [Hash] row cells (left offset => cell data)
		# @return [Boolean] true when every cell equals the first one (also true for an empty row)
		def self.is_all_same(row_data)
			cells = row_data.values
			cells.all? { |cell| cell == cells.first }
		end

		# Contains Unusable Data (Empty / Long Strings)
		# @param row_data [Hash] row cells (left offset => cell data)
		# @return [Boolean] true when any cell is empty or longer than MAX_CELL_LEN
		def self.contains_unusable(row_data)
			row_data.values.any? { |cell| cell.empty? || cell.length > MAX_CELL_LEN }
		end

		# Process Data
		# @param data [Array<Hash>] cell hashes with :top, :left and :data keys
		# @return [Array<Hash>] usable rows (left offset => cell data), in order of first appearance
		def self.process_data(data)

			# Build Data Table (top => { left => data })
			table = data.each_with_object({}) { |d, t| (t[d[:top]] ||= {})[d[:left]] = d[:data] }

			# Filter Table Rows (Remove Lone Elements & Footers)
			rows = table.reject do |top, row|
				row.size < 2 || (top % PAGE_OFF) >= FOOTER_TOP || is_all_same(row) || contains_unusable(row)
			end

			# Filter Table Cells (Remove Title Cells), then drop rows left with fewer than two cells.
			# The resulting table is returned to the caller (no debug output, no nil result).
			rows.map { |_top, row| row.reject { |_left, d| TITLE_CELL_REGEX =~ d } }.reject { |row| row.size < 2 }
		end

		# HTML Filter
		# @param s [String] decoded cell content
		# @return [String] copy of s with every '<br/>' replaced by a newline
		def self.hfilter(s)
			s.gsub '<br/>', "\n"
		end

		# Process Page Files
		# @param page_data [Enumerable] yields (index, page) pairs, each page being a
		#   collection of line strings (presumably a Hash of page index => lines - confirm against caller)
		# @return [Array<Hash>] table rows as produced by process_data
		def self.process_page_files(page_data)

			# Build HTML Entity Decoder
			coder = HTMLEntities.new

			# Collect File Data (every page shifts its 'top' values by one more PAGE_OFF)
			off = 0
			cells = page_data.flat_map do |_idx, page|
				off += PAGE_OFF
				page_off = off

				# Match each line once; lines that do not match LINE_REGEX are skipped
				page.map { |l| LINE_REGEX.match l }.compact.map do |m|
					{ top: page_off + m[1].to_i, left: m[2].to_i, data: hfilter(coder.decode(m[3])) }
				end
			end

			# Process File Data
			process_data cells
		end
	end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
pdftdx-0.2.0 lib/pdftdx/parser.rb
pdftdx-0.1.0 lib/pdftdx/parser.rb