package Omni::Omnirun;
# Configuration
use strict;
# Local libraries
use Omni::Config;
use Omni::Omniword;
# Extern libraries
use XML::Twig;
use XML::Parser;
# Global variables
# Lookup tables owned by Omni::Config. This module reads tag names from
# $tag_list, attribute names from $att_list and object names from $obj_list.
my $tag_list = $Omni::Config::tag_list;
my $att_list = $Omni::Config::att_list;
my $obj_list = $Omni::Config::obj_list;
# Temporary variables
# File-level scratch state shared between the XML::Twig handler (parse),
# which writes these, and set_raw, which copies them into the object.
# NOTE(review): because this state is shared by every object of the class,
# parsing is presumably not re-entrant -- confirm before nesting parses.
my $tmp_content = undef;
my $tmp_font_face = undef;
my $tmp_font_family = undef;
my $tmp_font_pitch = undef;
my $tmp_font_size = undef;
my $tmp_spacing = undef;
my $tmp_su_script = undef; # sub-script or super-script
my $tmp_underline = undef;
my $tmp_bold = undef;
my $tmp_italic = undef;
# Word objects collected by parse for the run being processed
my @tmp_words = ();
# A run object in Omnipage xml: a run contains zero or many words
# Do Hoang Nhat Huy, 07 Jan 2011
# Initialization
# Constructor: create an empty run object.
#
# Parameters:
#   $class - class name to bless the new object into
#
# Returns a blessed hash whose '_self' entry is the OMNIRUN name taken from
# $obj_list, whose '_words' entry is an empty array reference, and whose
# remaining entries are undef until set_raw fills them in.
sub new
{
    my ($class) = @_;

    # Class members
    my $self = {
        '_self'        => $obj_list->{ 'OMNIRUN' },
        '_raw'         => undef,
        '_content'     => undef,
        '_font_face'   => undef,
        '_font_family' => undef,
        '_font_pitch'  => undef,
        '_font_size'   => undef,
        '_spacing'     => undef,
        '_su_script'   => undef,    # sub-script or super-script
        '_underline'   => undef,
        '_bold'        => undef,
        '_italic'      => undef,
        # Words: a run can have multiple words; each object gets its own list
        '_words'       => [],
    };

    return bless $self, $class;
}
# Store the raw XML of a run and populate this object from it.
#
# Parameters:
#   $self - the run object
#   $raw  - XML text expected to contain a RUN element
#
# The XML is parsed with XML::Twig; the RUN handler (parse) leaves its
# results in the file-level $tmp_* scratch variables, which are then copied
# onto the object. Any exception raised by XML::Twig while parsing is not
# caught here and propagates to the caller.
sub set_raw
{
    my ($self, $raw) = @_;

    # Save the raw xml ...
    $self->{ '_raw' } = $raw;

    # Clear the scratch state first, so that a document without a RUN
    # element does not leave this object holding the previous run's values.
    ($tmp_content, $tmp_font_face, $tmp_font_family, $tmp_font_pitch,
     $tmp_font_size, $tmp_spacing, $tmp_su_script, $tmp_underline,
     $tmp_bold, $tmp_italic) = ();
    @tmp_words = ();

    # Only RUN elements are built and handed to the handler
    my $run_tag = $tag_list->{ 'RUN' };
    my $twig    = XML::Twig->new( twig_roots    => { $run_tag => 1 },
                                  twig_handlers => { $run_tag => \&parse },
                                  pretty_print  => 'indented' );

    # Start the XML parsing, then release the memory held by the twig
    $twig->parse($raw);
    $twig->purge;

    # Copy information from temporary variables to class members
    $self->{ '_font_face' }   = $tmp_font_face;
    $self->{ '_font_family' } = $tmp_font_family;
    $self->{ '_font_pitch' }  = $tmp_font_pitch;
    $self->{ '_font_size' }   = $tmp_font_size;
    $self->{ '_spacing' }     = $tmp_spacing;
    $self->{ '_su_script' }   = $tmp_su_script;
    $self->{ '_underline' }   = $tmp_underline;
    $self->{ '_bold' }        = $tmp_bold;
    $self->{ '_italic' }      = $tmp_italic;

    # Copy all words
    @{ $self->{ '_words' } } = @tmp_words;

    # Copy content
    $self->{ '_content' } = $tmp_content;
}
# Return the raw XML string stored by set_raw (undef if none was stored).
sub get_raw
{
    my ($self) = @_;
    return $self->{ '_raw' };
}
# XML::Twig handler for a RUN element: fills the file-level $tmp_* scratch
# variables from the element.
#
# Parameters:
#   $twig - the XML::Twig object driving the parse (not used here)
#   $node - the element that matched the RUN handler
#
# Results are left in the scratch variables; set_raw copies them onto the
# object once parsing has finished.
sub parse
{
    my ($twig, $node) = @_;

    # Get node attributes
    $tmp_font_face   = GetNodeAttr($node, $att_list->{ 'FONTFACE' });
    $tmp_font_family = GetNodeAttr($node, $att_list->{ 'FONTFAMILY' });
    $tmp_font_pitch  = GetNodeAttr($node, $att_list->{ 'FONTPITCH' });
    $tmp_font_size   = GetNodeAttr($node, $att_list->{ 'FONTSIZE' });
    $tmp_spacing     = GetNodeAttr($node, $att_list->{ 'SPACING' });
    $tmp_su_script   = GetNodeAttr($node, $att_list->{ 'SUSCRIPT' });    # sub-script or super-script
    $tmp_underline   = GetNodeAttr($node, $att_list->{ 'UNDERLINE' });
    $tmp_bold        = GetNodeAttr($node, $att_list->{ 'BOLD' });
    $tmp_italic      = GetNodeAttr($node, $att_list->{ 'ITALIC' });

    # At first, content is blank because there's no word
    $tmp_content = "";
    @tmp_words   = ();

    # Check if there's any child
    my $child = $node->first_child();

    # #PCDATA$ is the returned path from XML::Twig if $child is data content
    if ((defined $child) && ($child->path() =~ m/#PCDATA$/))
    {
        # The run starts with plain text: take the element's text, trimmed
        my $content = GetNodeText($node);
        $content =~ s/^\s+|\s+$//g;

        # Save the content
        $tmp_content = $tmp_content . $content;
    }
    else
    {
        # Some type of separator
        my $space_tag = $tag_list->{ 'SPACE' };
        my $tab_tag   = $tag_list->{ 'TAB' };
        my $word_tag  = $tag_list->{ 'WORD' };

        # Walk every child in document order. next_sibling yields undef
        # after the last child, which is what ends the loop.
        my $obj = $child;
        while (defined $obj)
        {
            my $xpath = $obj->path();

            if ($xpath =~ m/\/\Q$word_tag\E$/)
            {
                # A word: build the word object from this element's XML.
                # NOTE(review): assumes Omni::Omniword provides set_raw in
                # the same way this class does -- confirm.
                my $word = Omni::Omniword->new();
                $word->set_raw($obj->sprint());

                # Update word list
                push @tmp_words, $word;

                # Update content
                $tmp_content = $tmp_content . $word->get_content;
            }
            elsif ($xpath =~ m/\/\Q$space_tag\E$/)
            {
                # A space separator
                $tmp_content = $tmp_content . " ";
            }
            elsif ($xpath =~ m/\/\Q$tab_tag\E$/)
            {
                # A tab separator
                $tmp_content = $tmp_content . "\t";
            }
            # NEWLINE children are deliberately not added to the content.

            # Little brother
            $obj = $obj->next_sibling();
        }
    }
}
# Append a word object to this run's word list.
#
# Parameters:
#   $self - the run object
#   $word - the word to append
#
# Returns the value of push, i.e. the new number of words in the list.
sub add_word
{
    my ($self, $word) = @_;
    push @{ $self->{ '_words' } }, $word;
}
# Return the object name stored in '_self' (set by the constructor).
sub get_name
{
    my ($self) = @_;
    return $self->{ '_self' };
}
# Return the reference to this run's word list. The list itself is returned,
# not a copy, so changes made through the reference affect the object.
sub get_objs_ref
{
    my ($self) = @_;
    return $self->{ '_words' };
}
# Return the text content of the run as stored in '_content'.
sub get_content
{
    my ($self) = @_;
    return $self->{ '_content' };
}
# Return the font face value stored in '_font_face'.
sub get_font_face
{
    my ($self) = @_;
    return $self->{ '_font_face' };
}
# Return the font family value stored in '_font_family'.
sub get_font_family
{
    my ($self) = @_;
    return $self->{ '_font_family' };
}
# Return the font pitch value stored in '_font_pitch'.
sub get_font_pitch
{
    my ($self) = @_;
    return $self->{ '_font_pitch' };
}
# Return the font size value stored in '_font_size'.
sub get_font_size
{
    my ($self) = @_;
    return $self->{ '_font_size' };
}
# Return the spacing value stored in '_spacing'.
sub get_spacing
{
    my ($self) = @_;
    return $self->{ '_spacing' };
}
# Return the sub-script / super-script value stored in '_su_script'.
sub get_suscript
{
    my ($self) = @_;
    return $self->{ '_su_script' };
}
# Return the underline value stored in '_underline'.
sub get_underline
{
    my ($self) = @_;
    return $self->{ '_underline' };
}
# Return the bold value stored in '_bold'.
sub get_bold
{
    my ($self) = @_;
    return $self->{ '_bold' };
}
# Return the italic value stored in '_italic'.
sub get_italic
{
    my ($self) = @_;
    return $self->{ '_italic' };
}
# Support functions
# Read an attribute from an XML node.
#
# Parameters:
#   $node - object providing an att($name) method
#   $attr - attribute name
#
# Returns the attribute value, or the empty string when att() returns undef.
# The test is on definedness, not truth, so a value of "0" is returned as
# "0" instead of being replaced by "".
sub GetNodeAttr
{
    my ($node, $attr) = @_;

    # Look the attribute up once and reuse the result
    my $value = $node->att($attr);
    return (defined $value ? $value : "");
}
# Set an attribute on an XML node.
#
# Parameters:
#   $node  - object providing a set_att($name, $value) method
#   $attr  - attribute name
#   $value - value to assign
#
# Returns whatever the node's set_att method returns.
sub SetNodeAttr
{
    my ($node, $attr, $value) = @_;
    $node->set_att($attr, $value);
}
# Return the text of an XML node, as reported by the node's text method.
#
# Parameters:
#   $node - object providing a text method
sub GetNodeText
{
    my ($node) = @_;
    return $node->text;
}
sub SetNodeText
my ($node, $value) = @_;