Sha256: 22f975d3111623dac79ddb8c676f1e9ae1848c406401b2d0ececc78d198de25e

Contents?: true

Size: 1.59 KB

Versions: 133

Compression:

Stored size: 1.59 KB

Contents

#!/usr/bin/env ruby

require 'rbbt-util'
require 'rbbt/util/simpleopt'

# When $previous_commands is set (presumably by the rbbt command dispatcher --
# confirm against the launcher), report the whole command chain as the
# process name instead of just this script's path.
if $previous_commands
  $0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }"
end

# Usage text handed to SOPT; its last line declares the -h/--help flag.
usage_text = <<EOF
Assemble the TSV from a table extracted from a PDF

$ rbbt tsv assemble_pdf_table file.txt

When extracting tables from PDF they are often laid out one column at a time, divided by pages.
This command takes a file with the following structure:

1 A few lines containing table headers, one per line
2 A group of lines containing the values for the first column of the first page, ending in an empty line
3 More groups of lines corresponding to other columns
4 Repetitions of 2 and 3 for more pages

This script will take care of matching the columns read with the headers specified

-h--help Help
EOF

options = SOPT.setup usage_text

# Show the usage message when help was requested.
if options[:help]
  SOPT.usage
end

# The input is the first positional argument; fall back to standard input
# when no argument is given or when it is "-".
file = ARGV.shift
file = STDIN if file.nil? || file == '-'

# Read the whole input in one go and pass it through Misc.fixutf8
# (presumably repairs invalid UTF-8 sequences -- see rbbt-util).
input_stream = TSV.get_stream(file)
txt = Misc.fixutf8(input_stream.read)

# The text up to the first blank line is the header block: one field name per
# line. Everything after it is the sequence of per-column value blocks.
header, _sep, rest = txt.strip.partition("\n\n")
fields = header.split("\n")
num_columns = fields.length

lines = rest.split("\n")

# full_columns[i] accumulates, page after page, every value read for column i.
full_columns = Array.new(num_columns) { [] }

# Each iteration consumes one page: one block of values per column, in field
# order, with a single blank line after each block.
while num_columns > 0 && !lines.empty?
  # Tolerate extra blank lines before a page. Without this a leading blank
  # line would yield a zero-length first block and derail the slicing below.
  lines = lines.drop_while { |line| line.empty? }
  break if lines.empty?

  # The first column's block ends at the next blank line, which fixes the
  # number of rows on this page. When no blank line follows (single-column
  # table, or the block is the last thing in the input) it runs to the end.
  block_size = lines.index("") || lines.length

  num_columns.times do |pos|
    block = lines.first(block_size)

    # Fewer lines than expected can only mean the input ended early.
    if block.length < block_size
      warn "Column '#{fields[pos]}' has #{block.length} of #{block_size} expected values in the last page; input looks truncated"
    end

    full_columns[pos].concat(block)

    # Skip the block itself plus the blank separator line that follows it;
    # Array#drop returns [] (never nil) when the input is exhausted.
    lines = lines.drop(block_size + 1)
  end
end

# Emit the header line: the field names joined by tabs, prefixed with "#".
puts "#" + fields.join("\t")

# Misc.zip_fields presumably turns the per-column lists into per-row lists
# (confirm against rbbt-util). Iterate with #each: Array#zip without arguments
# would wrap every row in an extra one-element array.
Misc.zip_fields(full_columns).each do |values|
  puts values * "\t"
end

Version data entries

133 entries across 133 versions & 1 rubygems

Version Path
rbbt-util-5.19.16 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.15 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.14 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.13 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.12 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.11 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.10 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.9 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.8 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.7 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.6 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.5 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.4 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.3 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.2 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.1 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.19.0 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.18.1 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.18.0 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.17.89 share/rbbt_commands/tsv/assemble_pdf_table