package Omni::Omniframe;

# Configuration
use strict;
use warnings;

# Local libraries
use Omni::Config;
use Omni::Omnipara;
use Omni::Omnitable;

# Extern libraries
use XML::Twig;
use XML::Parser;

# Global variables: lookup tables shared with the other Omni::* classes
my $tag_list = $Omni::Config::tag_list;
my $att_list = $Omni::Config::att_list;
my $obj_list = $Omni::Config::obj_list;

###
# A frame object in Omnipage xml: a frame contains paragraphs
# (this is my observation and can be invalid under close scrutiny
# of new "evidence")
#
# Do Hoang Nhat Huy, 23 Feb 2011
###

# Initialization.
# Creates an empty frame: every field except the object name and the
# (empty) child-object list is undef until set_raw() is called.
sub new {
    my ($class) = @_;

    # Objs: paragraphs (and tables, see parse())
    my @objs = ();

    # Class members
    my $self = {
        '_self'    => $obj_list->{ 'OMNIFRAME' },
        '_raw'     => undef,
        '_content' => undef,
        '_bottom'  => undef,
        '_top'     => undef,
        '_left'    => undef,
        '_right'   => undef,
        '_objs'    => \@objs
    };

    bless $self, $class;
    return $self;
}

# Store the raw XML of a frame and parse it, filling in the position
# attributes, the child objects and the text content of this object.
#   $raw - XML string containing the frame element
sub set_raw {
    my ($self, $raw) = @_;

    # Save the raw xml ...
    $self->{ '_raw' } = $raw;

    # ... and parse it. The handler forwards a reference to $self so that
    # parse() can write the results back into this object.
    my $twig_roots    = { $tag_list->{ 'FRAME' } => 1 };
    my $twig_handlers = { $tag_list->{ 'FRAME' } => sub { parse(@_, \$self); } };

    # XML::Twig
    my $twig = XML::Twig->new(
        twig_roots    => $twig_roots,
        twig_handlers => $twig_handlers,
        pretty_print  => 'indented'
    );

    # Start the XML parsing. Only the document is passed: parse() takes
    # the source followed by option pairs, so a stray extra argument
    # would be treated as an unpaired option.
    $twig->parse($raw);
    $twig->purge;
}

# Return the raw XML string last given to set_raw() (undef if none).
sub get_raw {
    my ($self) = @_;
    return $self->{ '_raw' };
}

# Twig handler for the frame element.
#   $twig - the XML::Twig object (unused)
#   $node - the frame element
#   $self - REFERENCE to the Omniframe object to fill in (note the
#           double dereference $$self below)
sub parse {
    my ($twig, $node, $self) = @_;

    # At first, content is blank
    my $tmp_content = "";
    # because there's no object
    my @tmp_objs = ();

    # Get node attributes
    my $tmp_bottom = GetNodeAttr($node, $att_list->{ 'BOTTOM' });
    my $tmp_top    = GetNodeAttr($node, $att_list->{ 'TOP' });
    my $tmp_left   = GetNodeAttr($node, $att_list->{ 'LEFT' });
    my $tmp_right  = GetNodeAttr($node, $att_list->{ 'RIGHT' });

    # Check if there's any paragraph, dd, table, or picture
    # The large number of possible children is due to the
    # ambiguous structure of the Omnipage XML
    my $para_tag  = $tag_list->{ 'PARA' };
    my $table_tag = $tag_list->{ 'TABLE' };

    # A child is recognised by the last component of its path; the
    # patterns are loop-invariant, so compile them once.
    my $para_re  = qr/\/$para_tag$/;
    my $table_re = qr/\/$table_tag$/;

    # Walk over the direct children of the frame. next_sibling() returns
    # undef after the last child, which ends the loop.
    my $child = $node->first_child();
    while (defined $child) {
        my $xpath = $child->path();

        # Pick the wrapper class matching this child, if any
        my $obj;
        if ($xpath =~ $para_re) {
            $obj = Omni::Omnipara->new();
        }
        elsif ($xpath =~ $table_re) {
            $obj = Omni::Omnitable->new();
        }

        if (defined $obj) {
            # Set raw content
            $obj->set_raw($child->sprint());

            # Update object list
            push @tmp_objs, $obj;

            # Update content, one child per line. An undefined content
            # is appended as an empty string.
            my $text = $obj->get_content();
            $tmp_content .= (defined $text ? $text : "") . "\n";
        }

        # Little brother
        $child = $child->next_sibling();
    }

    # Copy information from temporary variables to class members
    $$self->{ '_bottom' } = $tmp_bottom;
    $$self->{ '_top' }    = $tmp_top;
    $$self->{ '_left' }   = $tmp_left;
    $$self->{ '_right' }  = $tmp_right;

    # Copy all objects (into the existing array, so references handed
    # out by get_objs_ref() stay valid)
    @{ $$self->{ '_objs' } } = @tmp_objs;

    # Copy content
    $$self->{ '_content' } = $tmp_content;
}

# Return the object name taken from the configuration ('OMNIFRAME' entry).
sub get_name {
    my ($self) = @_;
    return $self->{ '_self' };
}

# Return a reference to the array of child objects found by parse().
sub get_objs_ref {
    my ($self) = @_;
    return $self->{ '_objs' };
}

# Return the concatenated content of all child objects, each followed
# by a newline (undef before set_raw() has been called).
sub get_content {
    my ($self) = @_;
    return $self->{ '_content' };
}

# Position accessors: values of the frame's bottom/top/left/right
# attributes as found in the XML ("" when the attribute is missing).
sub get_bottom_pos {
    my ($self) = @_;
    return $self->{ '_bottom' };
}

sub get_top_pos {
    my ($self) = @_;
    return $self->{ '_top' };
}

sub get_left_pos {
    my ($self) = @_;
    return $self->{ '_left' };
}

sub get_right_pos {
    my ($self) = @_;
    return $self->{ '_right' };
}

# Support functions

# Return the value of attribute $attr on $node, or "" if the attribute
# is not present. A defined-ness test is used so that a legitimate
# value of "0" (e.g. a coordinate) is returned as-is.
sub GetNodeAttr {
    my ($node, $attr) = @_;
    my $value = $node->att($attr);
    return defined $value ? $value : "";
}

# Set attribute $attr on $node to $value.
sub SetNodeAttr {
    my ($node, $attr, $value) = @_;
    $node->set_att($attr, $value);
}

# Return the text of $node.
sub GetNodeText {
    my ($node) = @_;
    return $node->text;
}

# Replace the text of $node with $value.
sub SetNodeText {
    my ($node, $value) = @_;
    $node->set_text($value);
}

1;