module Parser
  # Locates the most recent downloaded copy of each API documentation file.
  class Files
    # Directory scanned for documentation files (relative to the working directory).
    DOCS_DIR = './data/docs'

    # Maps a documentation file-name pattern to the resource category it
    # describes. The three capture groups are the year, month and day embedded
    # in the file name.
    NAMES_TO_RESOURCES = {
      /Alula_API_Documentation_([0-9]{4})-([0-9]{2})-([0-9]{2})\.html/ => :reqresp,
      /Alula_Connect_Plus_API_Documentation_([0-9]{4})-([0-9]{2})-([0-9]{2})\.html/ => :helix
    }.freeze

    class << self
      # Builds a hash of category => path of the latest matching file.
      # A category maps to nil when no file matches its pattern.
      #
      # @return [Hash{Symbol => String, nil}]
      def load
        NAMES_TO_RESOURCES.each_with_object({}) do |(regex, category), coll|
          coll[category] = find_latest_file(regex)
        end
      end

      # Returns the path of the newest file in +dir+ whose path matches
      # +pattern+, or nil when nothing matches.
      #
      # "Newest" is decided by the matched portion of the file name: the names
      # embed an ISO date (YYYY-MM-DD), so the lexicographically greatest match
      # is the most recent snapshot. Directory listing order is not relied on.
      #
      # @param pattern [Regexp] file-name pattern to look for
      # @param dir [String] directory to scan
      # @return [String, nil]
      def find_latest_file(pattern, dir: DOCS_DIR)
        Dir.glob(File.join(dir, '*')).grep(pattern).max_by { |path| path[pattern] }
      end
    end
  end

  # Turns the flat HTML API documentation into an array of structured hashes.
  class Engine
    DELIMITER = 'hr'
    METHOD_TYPE_REGEX = /Method:|Resource:/
    REST_METHOD_TYPE = /Resource:/
    RPC_METHOD_TYPE = /Method:/
    RPC_SECTION_HEADING = /RPC API/

    # Raised when an expected element cannot be found among a delimiter's
    # following siblings.
    class ParseError < StandardError; end

    class << self
      #
      # Documentation is a flat structure. HRs split resources, and all elements
      # for each resource are siblings. This walks through the document and pulls
      # out data into a structured hash suitable for further parsing.
      # There are better ways to do this parsing. This is the quick, brute-force way.
      #
      # Parsing is best-effort: a chunk that cannot be parsed (any StandardError)
      # is skipped rather than aborting the whole document.
      #
      # Two passes are made over the children of #mainContent: one over every
      # node, and one over the nodes from the "RPC API" h1 onwards. The results
      # are concatenated in that order.
      # NOTE(review): the first pass also visits the delimiters inside the RPC
      # section, so RPC entries may appear twice if they parse in both passes -
      # confirm against a real document before de-duplicating.
      #
      # @param category [Symbol] documentation category (currently unused)
      # @param file [String] path of the HTML file to parse
      # @return [Array<Hash>] one hash per parsed resource with the keys :type,
      #   :name, :path, :methods, :description, :relationships, :parameters
      #   and :fields
      def parse_doc(category, file)
        doc = Nokogiri::HTML(File.read(file))
        nodes = doc.css('#mainContent').children.to_a
        rpc_nodes = nodes.drop_while { |node| !rpc_heading?(node) }

        parse_resources(nodes) + parse_resources(rpc_nodes)
      end

      private

      # True for the h1 that introduces the RPC section.
      def rpc_heading?(node)
        node.name == 'h1' && node.text =~ RPC_SECTION_HEADING
      end

      # Parses the chunk that follows every delimiter in +nodes+ and returns the
      # successfully parsed ones, in document order. Each result is collected as
      # soon as it is parsed, so the final chunk is not lost.
      def parse_resources(nodes)
        nodes.each_with_object([]) do |node, resources|
          next unless node.name == DELIMITER

          datum = try_parse_chunk(node)
          resources << datum if datum
        end
      end

      # Best-effort wrapper: returns nil when the chunk cannot be parsed.
      def try_parse_chunk(node)
        parse_chunk(node)
      rescue StandardError
        nil
      end

      # Builds the structured hash for the chunk following the delimiter +node+.
      # :methods, :relationships and :fields are only looked up for REST
      # resources and are false otherwise.
      def parse_chunk(node)
        type = find_type(node)
        name = find_name(node).strip
        path = find_path(node).strip
        description = find_description(node).strip
        parameters = find_parameters(node)
        rest = type == :rest

        {
          type: type,
          name: name,
          path: path,
          methods: rest ? find_methods(node) : false,
          description: description,
          relationships: rest ? find_relationships(node) : false,
          parameters: parameters,
          fields: rest ? find_fields(node) : false
        }
      end

      # Walks forward from +start+ (inclusive) through next_sibling links until
      # the block returns a truthy value, and returns that node.
      #
      # @raise [ParseError] when the siblings run out first
      def scan_from(start, wanted)
        current = start
        current = current.next_sibling until current.nil? || yield(current)
        raise ParseError, "could not find #{wanted}" if current.nil?

        current
      end

      # Returns the first sibling after +node+ whose text matches +pattern+.
      # NOTE(review): the search is not bounded by the next delimiter, so a
      # chunk missing a section can pick up the one from a later chunk.
      def seek_text(node, pattern)
        scan_from(node.next_sibling, "text matching #{pattern.inspect}") do |sibling|
          sibling.text =~ pattern
        end
      end

      # :rpc when the chunk heading contains "Method:", otherwise :rest.
      def find_type(node)
        heading = seek_text(node, METHOD_TYPE_REGEX)
        heading.text =~ RPC_METHOD_TYPE ? :rpc : :rest
      end

      # Heading text with the "Method:" / "Resource:" label removed.
      def find_name(node)
        seek_text(node, METHOD_TYPE_REGEX).text.gsub(METHOD_TYPE_REGEX, '')
      end

      # Text of the <code> elements in the first <p> at or after the
      # "Base URL" node.
      def find_path(node)
        base_url = seek_text(node, /Base URL/)
        paragraph = scan_from(base_url, 'a <p> after "Base URL"') { |sibling| sibling.name == 'p' }
        paragraph.css('code').text
      end

      # Text of every <code> element inside the "Available methods" node.
      def find_methods(node)
        seek_text(node, /Available methods/).css('code').map(&:text)
      end

      # Concatenated text of the siblings between the "Description" node and
      # the next h3.
      #
      # @raise [ParseError] when no h3 follows the description
      def find_description(node)
        sibling = seek_text(node, /Description/).next_sibling
        text = []
        until sibling.nil? || sibling.name == 'h3'
          text << sibling.text
          sibling = sibling.next_sibling
        end
        raise ParseError, 'could not find the h3 that ends the description' if sibling.nil?

        text.join
      end

      def find_relationships(node)
        find_table_section(node, /Relationships/)
      end

      def find_parameters(node)
        find_table_section(node, /Parameters/)
      end

      def find_fields(node)
        find_table_section(node, /Fields/)
      end

      # Looks two siblings past the node matching +heading_pattern+: a <p> there
      # yields false, a <table> is converted with table_to_hash, and anything
      # else raises.
      def find_table_section(node, heading_pattern)
        body = seek_text(node, heading_pattern).next_sibling.next_sibling

        case body.name
        when 'p'
          false
        when 'table'
          table_to_hash(body)
        else
          raise 'WTF nothing?'
        end
      end

      #
      # Transform an HTML table with headers into an array of objects.
      # Object keys are the downcased table header text, values are cell value
      # text. Missing cells become nil; cells beyond the headers are ignored.
      def table_to_hash(node)
        headers = node.css('thead th').map { |el| el.text.downcase }
        node.css('tbody tr').map do |row|
          headers.zip(row.css('td').map(&:text)).to_h
        end
      end
    end
  end
end