package Omni::Omnipage; # Configuration use strict; # Local libraries use Omni::Config; use Omni::Omnidd; use Omni::Omnicol; use Omni::Omniframe; # Extern libraries use XML::Twig; use XML::Parser; # Global variables my $tag_list = $Omni::Config::tag_list; my $att_list = $Omni::Config::att_list; my $obj_list = $Omni::Config::obj_list; # Temporary variables my $tmp_content = undef; my @tmp_objs = (); ### # A page object in Omnipage xml: a page contains zero or many collums # # Do Hoang Nhat Huy, 09 Jan 2011 ### # Initialization sub new { my ($class) = @_; # Page: a page can have many columns, many tables, or many images my @objs = (); # Class members my $self = { '_self' => $obj_list->{ 'OMNIPAGE' }, '_raw' => undef, '_content' => undef, '_objs' => \@objs }; bless $self, $class; return $self; } # sub set_raw { my ($self, $raw) = @_; # Save the raw xml ... $self->{ '_raw' } = $raw; # Parse the raw string my $twig_roots = { $tag_list->{ 'PAGE' } => 1 }; my $twig_handlers = { $tag_list->{ 'PAGE' } => \&parse}; # XML::Twig my $twig = new XML::Twig( twig_roots => $twig_roots, twig_handlers => $twig_handlers, pretty_print => 'indented' ); # Start the XML parsing $twig->parse($raw); $twig->purge; # Copy information from temporary variables to class members # Copy all columns @{$self->{ '_objs' } } = @tmp_objs; # Copy content $self->{ '_content' } = $tmp_content; } sub get_raw { my ($self) = @_; return $self->{ '_raw' }; } sub parse { my ($twig, $node) = @_; # At first, content is blank $tmp_content = ""; # because there's no columnm, table or image @tmp_objs = (); # Get node attributes # At version 16, Omnipage page does not have any interesting atribute my $child = undef; # Get the body text $child = $node->first_child( $tag_list->{ 'BODY' } ); # Page with no body, return if (! defined $child) { return; } # Get the first child in the body text $child = $child->first_child(); # The child of is usually
but it's not always the case my $section_tag = $tag_list->{ 'SECTION' }; #
, are usually not the children but the # desendents of but I'm not sure about this my $dd_tag = $tag_list->{ 'DD' }; my $column_tag = $tag_list->{ 'COL' }; my $frame_tag = $tag_list->{ 'FRAME' }; # Check if there's any column or dd, what the heck is dd while (defined $child) { my $xpath = $child->path(); # if this child is
, then and
tag are grandchild of if ($xpath =~ m/\/$section_tag$/) { # Get the first grand child my $grand_child = $child->first_child(); # Subloop while (defined $grand_child) { my $grand_xpath = $grand_child->path(); # if this child is if ($grand_xpath =~ m/\/$column_tag$/) { my $column = new Omni::Omnicol(); # Set raw content $column->set_raw($grand_child->sprint()); # Update column list push @tmp_objs, $column; # Update content $tmp_content = $tmp_content . $column->get_content() . "\n"; } # if this child is
elsif ($grand_xpath =~ m/\/$dd_tag$/) { my $dd = new Omni::Omnidd(); # Set raw content $dd->set_raw($child->sprint()); # Update column list push @tmp_objs, $dd; # Update content $tmp_content = $tmp_content . $dd->get_content() . "\n"; } # if this child is elsif ($xpath =~ m/\/$frame_tag$/) { my $frame = new Omni::Omniframe(); # Set raw content $frame->set_raw($child->sprint()); # Update column list push @tmp_objs, $frame; # Update content $tmp_content = $tmp_content . $frame->get_content() . "\n"; } # Little brother if ($grand_child->is_last_child) { last; } else { $grand_child = $grand_child->next_sibling(); } } } # if this child is elsif ($xpath =~ m/\/$column_tag$/) { my $column = new Omni::Omnicol(); # Set raw content $column->set_raw($child->sprint()); # Update column list push @tmp_objs, $column; # Update content $tmp_content = $tmp_content . $column->get_content() . "\n"; } # if this child is
elsif ($xpath =~ m/\/$dd_tag$/) { my $dd = new Omni::Omnidd(); # Set raw content $dd->set_raw($child->sprint()); # Update column list push @tmp_objs, $dd; # Update content $tmp_content = $tmp_content . $dd->get_content() . "\n"; } # if this child is elsif ($xpath =~ m/\/$frame_tag$/) { my $frame = new Omni::Omniframe(); # Set raw content $frame->set_raw($child->sprint()); # Update column list push @tmp_objs, $frame; # Update content $tmp_content = $tmp_content . $frame->get_content() . "\n"; } # Little brother if ($child->is_last_child) { last; } else { $child = $child->next_sibling(); } } } sub get_name { my ($self) = @_; return $self->{ '_self' }; } sub get_objs_ref { my ($self) = @_; return $self->{ '_objs' }; } sub get_content { my ($self) = @_; return $self->{ '_content' }; } # Support functions sub GetNodeAttr { my ($node, $attr) = @_; return ($node->att($attr) ? $node->att($attr) : ""); } sub SetNodeAttr { my ($node, $attr, $value) = @_; $node->set_att($attr, $value); } sub GetNodeText { my ($node) = @_; return $node->text; } sub SetNodeText { my ($node, $value) = @_; $node->set_text($value); } 1;