Sha256: 93f925be0299682e02a417efefd157fec77c06e7121bcaf874ac40c68314debb

Contents?: true

Size: 1.81 KB

Versions: 591

Compression:

Stored size: 1.81 KB

Contents

#!/usr/bin/env ruby

require 'rbbt-util'
require 'rbbt/util/simpleopt'

# Report the program name as "rbbt <previous commands> <this file's basename>"
# when $previous_commands is set. That global is not assigned in this file
# (presumably set by the rbbt command dispatcher as an Array — confirm).
$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands

# The heredoc below is handed verbatim to SOPT.setup; it holds the help text
# together with the option lines (-h/--help, -r/--row). The returned object is
# indexed below as options[:help] and, later in the script, options[:row].
options = SOPT.setup <<EOF
Assemble the TSV from a table extracted from a PDF

$ rbbt tsv assemble_pdf_table file.txt

When extracting tables from PDF they are often laid out one column at a time, divided by pages.
This command takes a file with the following structure:

1. A few lines containing table headers, one per line

2. A group of lines containing the values for the first column of the first page, ending in an empty line

3. More groups of lines corresponding to other columns

4. Repetitions of 2 and 3 for more pages

This script will take care of matching the columns read with the headers specified

-h--help Help
-r--row Each block of lines is a row, not a column
EOF

# Show usage when help was requested.
# NOTE(review): presumably SOPT.usage terminates the process — confirm.
SOPT.usage if options[:help]

# The first positional argument names the input.
file = ARGV.shift

# '-' or a missing argument selects standard input instead.
file = STDIN if file == '-' or file.nil?


# Read the entire input into memory and pass it through Misc.fixutf8
# (presumably repairs/normalizes the text encoding — confirm against rbbt-util).
txt = Misc.fixutf8(TSV.get_stream(file).read)

# Split the input into the header block (one field name per line) and the
# data that follows the first empty line.
header, _sep, rest = txt.strip.partition("\n\n")
fields = header.split("\n")
num_columns = fields.length

# columns[i] collects, page after page, the blocks of values read for field i.
columns = {}

num_columns.times do |i|
  columns[i] = []
end

lines = rest.split("\n")

# Each pass of this loop consumes one page: the first block is delimited by an
# empty line and fixes the block size; the remaining blocks of the page are
# taken with that same size, each followed by one separator line.
until lines.empty?
  # Tolerate extra blank lines before a page. Without this, a leading empty
  # line yields a separator index of 0 and the whole remaining input would be
  # swallowed as a single block.
  lines = lines.drop_while(&:empty?)
  break if lines.empty?

  # When no separator remains (single-column table, or a last page holding
  # only one block) the block runs to the end of the input instead of raising
  # on a nil index.
  block_size = lines.index("") || lines.length

  columns[0] << lines.take(block_size)
  # Skip the block and its separator line; drop never returns nil.
  lines = lines.drop(block_size + 1)

  (1..num_columns-1).each do |pos|
    # Input exhausted in the middle of a page: leave remaining columns as is.
    break if lines.empty?

    columns[pos] << lines.take(block_size)
    lines = lines.drop(block_size + 1)
  end
end

# Emit the result on standard output.
if options[:row]
  # Row mode: every block that was read is printed as one tab-separated line,
  # walking the blocks stored under each column index in turn.
  columns.each_value do |list|
    list.each do |values|
      puts values * "\t"
    end
  end
else
  # Column mode: concatenate the per-page blocks of every field into one full
  # column, in field order.
  full_columns = Array.new(num_columns) { |i| columns[i].flatten }

  # Build the header line by interpolation rather than appending to a string
  # literal, which breaks when string literals are frozen.
  puts "##{fields * "\t"}"

  # Misc.zip_fields is expected to turn the list of columns into a list of
  # rows (assumption from its use here — confirm against rbbt-util). Iterate
  # the rows directly instead of going through Array#zip, which wrapped each
  # row in an extra array.
  Misc.zip_fields(full_columns).each do |values|
    puts values * "\t"
  end
end

Version data entries

591 entries across 591 versions & 1 rubygem

Version Path
rbbt-util-5.37.8 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.37.6 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.37.4 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.37.3 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.37.1 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.37.0 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.36.0 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.35.4 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.35.3 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.35.2 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.35.1 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.34.27 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.34.26 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.34.25 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.34.24 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.34.23 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.34.22 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.34.21 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.34.20 share/rbbt_commands/tsv/assemble_pdf_table
rbbt-util-5.34.18 share/rbbt_commands/tsv/assemble_pdf_table