require 'hpricot/htmlinfo'
# Shorthand for Hpricot.parse: forwards +input+ and +opts+ unchanged and
# returns whatever Hpricot.parse returns.
def Hpricot(input, opts = {})
  ::Hpricot.parse(input, opts)
end
module Hpricot
# Hpricot.parse parses +input+ and returns a document tree,
# represented by Hpricot::Doc.
#
# +opts+ is passed through to Hpricot.make unchanged.
def Hpricot.parse(input, opts = {})
  nodes = make(input, opts)
  Doc.new(nodes)
end
# :stopdoc:
# Builds the list of top-level nodes for +input+.
#
# The tokens yielded by Hpricot.scan are folded into a nested structure by
# means of a stack of currently open elements; every top-level structure is
# then converted with build_node.
#
# Each stack entry is an Array of the form
#   [tagname, stag_token, children, excluded_tags, included_tags, uncontainable_tags]
# stack[0] is a sentinel root (tagname nil) whose children become the result.
#
# Tokens are Arrays. This method reads token[0] (the token type), token[1]
# (a tag name or text) and token[3] (presumably the raw source text of the
# token -- confirm against Hpricot.scan).
#
# Options:
#   :fixup_tags - when true, a start tag that the open elements are not
#                 allowed to contain (according to ElementContent,
#                 ElementExclusions and ElementInclusions) causes those
#                 elements to be taken off the stack before the new element
#                 is opened. Defaults to false.
#
# Returns an Array holding the result of build_node for each top-level
# structure.
def Hpricot.make(input, opts = {})
  opts = {:fixup_tags => false}.merge(opts)
  stack = [[nil, nil, [], [], [], []]]
  Hpricot.scan(input) do |token|
    # While the innermost open element has :CDATA content, every token other
    # than that element's own end tag is demoted to plain text.
    if stack.last[5] == :CDATA && !(token[0] == :etag && token[1].downcase == stack.last[0])
      token[0] = :text
      token[1] = token[3] if token[3]
    end

    case token[0]
    when :stag
      # From here on slot 0 of the token holds the downcased tag name.
      stagname = token[0] = token[1].downcase
      if ElementContent[stagname] == :EMPTY
        # Elements declared :EMPTY never go on the stack.
        token[0] = :emptytag
        stack.last[2] << token
      else
        # Obey the tag rules set up by the currently open elements.
        if opts[:fixup_tags] && ElementContent.has_key?(stagname)
          # Walk outwards from the innermost element for as long as the open
          # elements forbid this tag; +trans+ ends up as the outermost such
          # index (nil when the innermost element already allows the tag).
          trans = nil
          (stack.length - 1).downto(0) do |i|
            untags = stack[i][5]
            break unless untags.include? stagname
            trans = i
          end
          # Entries from +trans+ upwards are removed from the stack and
          # appended to the children of the entry below them. Nothing is
          # moved when trans is nil, 0 or 1.
          if trans.to_i > 1
            eles = stack.slice!(trans..-1)
            stack.last[2] += eles
          end
        end

        # Set up the tag rules for inside this element. All three stay nil
        # unless one of the branches below assigns them.
        excluded_tags = included_tags = uncontainable_tags = nil
        if ElementContent[stagname] == :CDATA
          uncontainable_tags = :CDATA
        elsif opts[:fixup_tags]
          possible_tags = ElementContent[stagname]
          excluded_tags, included_tags = stack.last[3..4]
          if possible_tags
            excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
            included_tags = included_tags | (ElementInclusions[stagname] || [])
            containable_tags = (possible_tags | included_tags) - excluded_tags
            uncontainable_tags = ElementContent.keys - containable_tags
          else
            # If the tagname is unknown, it is assumed that any element
            # except excluded can be contained.
            uncontainable_tags = excluded_tags
          end
        end
        stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
      end
    when :etag
      etagname = token[0] = token[1].downcase
      matched_elem = nil
      # Look for the nearest open element with the same (downcased) name.
      (stack.length - 1).downto(0) do |i|
        if stack[i][0] == etagname
          matched_elem = stack[i]
          # The end-tag token is concatenated onto the start-tag token, so
          # the combined Array carries the end tag's fields after the start
          # tag's fields.
          stack[i][1] += token
          # Elements still open above the match are moved into its children.
          eles = stack.slice!((i + 1)..-1)
          stack.last[2] += eles
          break
        end
      end
      if matched_elem
        # The matched element is now on top: close it into its parent.
        ele = stack.pop
        stack.last[2] << ele
      else
        # No open element matches. Store the name and the raw text as
        # separate slots, which is the layout build_node reads for
        # :bogus_etag (structure[1] and structure[2]).
        stack.last[2] << [:bogus_etag, token[0], token[3]]
      end
    when :text
      # Adjacent text tokens are merged into one.
      l = stack.last[2].last
      if l && l[0] == :text
        l[1] += token[1]
      else
        stack.last[2] << token
      end
    else
      stack.last[2] << token
    end
  end

  # Close whatever is still open at end of input, innermost first.
  while stack.length > 1
    ele = stack.pop
    stack.last[2] << ele
  end

  structure_list = stack[0][2]
  structure_list.map { |s| build_node(s) }
end
# Re-parents the children of +elem+ according to the content rules in
# ElementContent, ElementExclusions and ElementInclusions.
#
# +elem+ is an Array whose slot 1 is the element's token (tag name in slot 0,
# end-tag raw text in slot 7, nil when the element had no end tag) and whose
# slot 2 is the Array of child structures. +excluded_tags+ and
# +included_tags+ are the tag-name Arrays inherited from the enclosing
# elements.
#
# Returns a two-element Array [elem, rest]: +elem+ with its children (slot 2)
# replaced, and +rest+, the child structures that were not kept inside
# +elem+.
#
# * If the element has an end tag, its children are handed to
#   fix_structure_list (not defined in this part of the file) and +rest+ is
#   empty.
# * If the element is declared :EMPTY, it keeps no children and all of them
#   are returned in +rest+.
# * Otherwise the children are taken in order. The first child element whose
#   tag may not appear inside +elem+ stops the scan; it and everything after
#   it are returned in +rest+. Child elements that may stay are fixed
#   recursively and whatever they hand back is re-queued ahead of the
#   remaining children.
#
# Note: the children Array passed in via elem[2] is consumed in place.
def Hpricot.fix_element(elem, excluded_tags, included_tags)
  tagname, _, _, _, _, _, _, eraw = elem[1]
  children = elem[2]

  if eraw
    elem[2] = fix_structure_list(children)
    return elem, []
  end

  if ElementContent[tagname] == :EMPTY
    elem[2] = []
    return elem, children
  end

  if ElementContent[tagname] == :CDATA
    possible_tags = []
  else
    possible_tags = ElementContent[tagname]
  end

  if possible_tags
    excluded_tags2 = ElementExclusions[tagname]
    included_tags2 = ElementInclusions[tagname]
    excluded_tags |= excluded_tags2 if excluded_tags2
    included_tags |= included_tags2 if included_tags2
    containable_tags = (possible_tags | included_tags) - excluded_tags
    uncontainable_tags = ElementContent.keys - containable_tags
  else
    # If the tagname is unknown, it is assumed that any element
    # except excluded can be contained.
    uncontainable_tags = excluded_tags
  end

  fixed_children = []
  rest = children
  until rest.empty?
    # Element structures carry a String tag name in slot 0; every other
    # structure is kept as it is.
    if String === rest[0][0]
      # A separate local is used for the child so that +elem+ keeps
      # referring to the element being fixed when it is updated and
      # returned below.
      child = rest.shift
      child_tagname = child[0].downcase
      if uncontainable_tags.include? child_tagname
        rest.unshift child
        break
      else
        fixed_child, rest2 = fix_element(child, excluded_tags, included_tags)
        fixed_children << fixed_child
        rest = rest2 + rest
      end
    else
      fixed_children << rest.shift
    end
  end

  elem[2] = fixed_children
  return elem, rest
end
# Converts one parsed structure (an Array whose slot 0 identifies its kind)
# into the matching node object.
#
# A String in slot 0 marks an element: slot 1 holds its token and slot 2 its
# child structures, which are converted recursively. Every other kind is a
# Symbol and is dispatched to the corresponding parse helper.
#
# Raises Exception when slot 0 is not a recognised kind.
def Hpricot.build_node(structure)
  case structure[0]
  when String
    name, _, attributes, start_raw, _, _, _, end_raw = structure[1]
    kids = structure[2]
    # The end tag only exists when the token carries end-tag raw text.
    etag = end_raw && ETag.parse(name, end_raw)
    stag = STag.parse(name, attributes, start_raw, true)
    if kids.empty? && !etag
      Elem.new(stag)
    else
      Elem.new(stag, kids.map { |kid| build_node(kid) }, etag)
    end
  when :text          then Text.parse_pcdata(structure[1])
  when :emptytag      then Elem.new(STag.parse(structure[1], structure[2], structure[3], false))
  when :bogus_etag    then BogusETag.parse(structure[1], structure[2])
  when :xmldecl       then XMLDecl.parse(structure[2], structure[3])
  when :doctype       then DocType.parse(structure[1], structure[2], structure[3])
  when :procins       then ProcIns.parse(structure[1], structure[2], structure[3])
  when :comment       then Comment.parse(structure[1])
  when :cdata_content then Text.parse_cdata_content(structure[1])
  when :cdata         then Text.parse_cdata_section(structure[1])
  else
    raise Exception, "[bug] unknown structure: #{structure.inspect}"
  end
end
# Creates an STag from +qname+ and +attrs+ and records +raw_string+ on it.
# +is_stag+ is accepted but not used by this method.
def STag.parse(qname, attrs, raw_string, is_stag)
  STag.new(qname, attrs).tap do |stag|
    stag.raw_string = raw_string
  end
end
# Creates an instance of the receiving class for +qname+ and records
# +raw_string+ on it.
def ETag.parse(qname, raw_string)
  new(qname).tap do |etag|
    etag.raw_string = raw_string
  end
end
# Creates an instance of the receiving class for +qname+ (the name of an end
# tag) and records +raw_string+ on it.
def BogusETag.parse(qname, raw_string)
  bogus = new(qname)
  bogus.raw_string = raw_string
  bogus
end
# Creates a Text node from +raw_string+; the same string is used both as the
# node's content and as its raw_string.
def Text.parse_pcdata(raw_string)
  Text.new(raw_string).tap do |text|
    text.raw_string = raw_string
  end
end
# Creates a Text node from +raw_string+ (used as both content and
# raw_string) and sets its @cdata instance variable to true.
def Text.parse_cdata_content(raw_string)
  Text.new(raw_string).tap do |text|
    text.raw_string = raw_string
    text.instance_variable_set( "@cdata", true )
  end
end
# Creates a Text node holding +content+; its raw_string is set to the empty
# string.
def Text.parse_cdata_section(content)
  section = Text.new(content)
  section.raw_string = ""
  section
end
# Creates an XMLDecl from the 'version', 'encoding' and 'standalone' entries
# of +attrs+ (a nil +attrs+ is treated as an empty Hash) and records
# +raw_string+ on it.
#
# 'standalone' is mapped to true for 'yes', false for 'no' and nil for
# anything else.
def XMLDecl.parse(attrs, raw_string)
  attrs ||= {}
  version = attrs['version']
  encoding = attrs['encoding']
  standalone = case attrs['standalone']
               when 'yes' then true
               when 'no' then false
               end
  XMLDecl.new(version, encoding, standalone).tap do |decl|
    decl.raw_string = raw_string
  end
end
# Creates a DocType for the downcased +root_element_name+ and records
# +raw_string+ on it. The public and system identifiers are read from the
# 'public_id' and 'system_id' entries of +attrs+; both are nil when +attrs+
# is nil or false.
def DocType.parse(root_element_name, attrs, raw_string)
  public_identifier = attrs ? attrs['public_id'] : nil
  system_identifier = attrs ? attrs['system_id'] : nil
  doctype = DocType.new(root_element_name.downcase, public_identifier, system_identifier)
  doctype.raw_string = raw_string
  doctype
end
# Creates a ProcIns from +target+ and +content+ and records +raw_string+ on
# it.
def ProcIns.parse(target, content, raw_string)
  ProcIns.new(target, content).tap do |procins|
    procins.raw_string = raw_string
  end
end
# Creates a Comment holding +content+; its raw_string is set to the empty
# string.
def Comment.parse(content)
  comment = Comment.new(content)
  comment.raw_string = ""
  comment
end
# Regular expression building blocks for names.
module Pat
  # A single character allowed inside a name: ASCII letter, digit, '-', '.',
  # '_' or ':'.
  NameChar = /[-A-Za-z0-9._:]/

  # A name: an ASCII letter, '_' or ':' followed by any number of NameChar.
  Name = /[A-Za-z_:]#{NameChar}*/

  # A name token: one or more NameChar.
  Nmtoken = /#{NameChar}+/
end
# :startdoc:
end