lib/MESH/tree.rb in mesh-medical-subject-headings-2.3.0 vs lib/MESH/tree.rb in mesh-medical-subject-headings-3.0.0

- old
+ new

@@ -1,128 +1,107 @@ module MESH class Tree @@default_locale = :en_us + @@sw = Clarifier::StopWords.new() def initialize - @headings = [] - @by_unique_id = {} - @by_tree_number = {} - @by_original_heading = {} - @by_entry = {} - @by_entry_word = Hash.new { |h, k| h[k] = Set.new } + @headings_last_position = -1 + @headings = GoogleHashDenseLongToRuby.new + @headings_by_unique_id = GoogleHashDenseLongToRuby.new + @headings_by_tree_number = GoogleHashDenseLongToRuby.new + @headings_by_original_heading = GoogleHashDenseLongToRuby.new + @entries_by_term = GoogleHashDenseLongToRuby.new + @entries_by_loose_match_term = GoogleHashDenseLongToRuby.new #case insensitive, no punctuation, normalised whitespace + # @entries_by_word = Hash.new { |h, k| h[k] = Set.new } + @entries_by_first_word = GoogleHashDenseLongToRuby.new + # @entries_by_first_word = Hash.new { |h, k| h[k] = Set.new } @locales = [@@default_locale] filename = File.expand_path('../../../data/mesh_data_2014/d2014.bin.gz', __FILE__) gzipped_file = File.open(filename) file = Zlib::GzipReader.new(gzipped_file) lines = [] file.each_line do |line| case - when line.match(/^\*NEWRECORD$/) + when line.start_with?('*NEWRECORD') unless lines.empty? mh = MESH::Heading.new(self, @@default_locale, lines) - add_heading_to_hashes(mh) + @headings_last_position += 1 + @headings[@headings_last_position] = mh + @headings_by_unique_id[mh.unique_id.hash] = mh + @headings_by_original_heading[mh.original_heading.hash] = mh + mh.tree_numbers.each do |tree_number| + hash = tree_number.hash + raise if @headings_by_tree_number[hash] + @headings_by_tree_number[hash] = mh + end + mh.structured_entries.each do |entry| + @entries_by_term[entry.term.hash] = entry + @entries_by_loose_match_term[entry.loose_match_term.hash] = entry + entry_words = entry.term.downcase.split(/\W+/) + hash = entry_words[0].hash + @entries_by_first_word[hash] ||= Set.new + @entries_by_first_word[hash] << entry + end lines = [line] end else lines << line end end - @headings.each do |heading| - heading.connect_to_parents - heading.connect_to_forward_references + (0..@headings_last_position).each do |i| + # @headings.each do |heading| + @headings[i].connect_to_parents + @headings[i].connect_to_forward_references end end - def add_heading_to_hashes(mh) - @headings << mh - @by_unique_id[mh.unique_id] = mh - @by_original_heading[mh.original_heading] = mh - add_heading_by_entry_word(mh, mh.original_heading) - mh.tree_numbers.each do |tree_number| - raise if @by_tree_number[tree_number] - @by_tree_number[tree_number] = mh - end - match_headings = mh.entries.map { |e| entry_match_key(e) }.uniq - match_headings.each do |entry| - raise if @by_entry[entry] - @by_entry[entry] = mh - add_heading_by_entry_word(mh, entry) - end - end - - def add_heading_by_entry_word(mh, entry) - entry.split.each do |word| - word.downcase! - @by_entry_word[word] << mh - end - end - - def entry_match_key(e) - e.strip.upcase - end - def load_translation(locale) return if @locales.include? locale filename = File.expand_path("../../../data/mesh_data_2014/d2014.#{locale}.bin.gz", __FILE__) gzipped_file = File.open(filename) file = Zlib::GzipReader.new(gzipped_file) - entries = [] - original_heading = nil - natural_language_name = nil - summary = nil unique_id = nil + lines = [] file.each_line do |line| case - when line.match(/^\*NEWRECORD$/) - unless unique_id.nil? - entries.sort! - entries.uniq! - if heading = find(unique_id) - heading.set_original_heading(original_heading, locale) unless original_heading.nil? - heading.set_natural_language_name(natural_language_name, locale) unless natural_language_name.nil? - heading.set_summary(summary, locale) unless summary.nil? - entries.each do |entry| - heading.entries(locale) << entry - add_heading_by_entry_word(heading, entry) + when line.start_with?('*NEWRECORD') + unless unique_id.nil? || lines.empty? + if heading = find_heading_by_unique_id(unique_id) + new_entries = heading.load_translation(lines, locale) + new_entries.each do |entry| + @entries_by_term[entry.term.hash] = entry + @entries_by_loose_match_term[entry.loose_match_term.hash] = entry + entry_words = entry.term.downcase.split(/\W+/) + hash = entry_words[0].hash + @entries_by_first_word[hash] ||= Set.new + @entries_by_first_word[hash] << entry end + else + raise 'Translation provided for missing header' end - entries = [] - original_heading = nil - summary = nil unique_id = nil + lines = [] end when matches = line.match(/^UI = (.*)/) unique_id = matches[1] - when matches = line.match(/^MS = (.*)/) - summary = matches[1] - - when matches = line.match(/^MH = (.*)/) - mh = matches[1] - original_heading = mh - entries << mh - librarian_parts = mh.match(/(.*), (.*)/) - natural_language_name = librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}" - - when matches = line.match(/^(?:PRINT )?ENTRY = ([^|]+)/) - entry = matches[1].chomp - entries << entry - end + lines << line + end @locales << locale end def load_wikipedia @@ -137,13 +116,13 @@ case when line.match(/^\*NEWRECORD$/) unless unique_id.nil? - if heading = find(unique_id) + if heading = find_heading_by_unique_id(unique_id) wikipedia_links.each do |wl| - wl[:score] = (wl[:score].to_f / heading.entries.length.to_f).round(2) + wl[:score] = (wl[:score].to_f / heading.structured_entries.length.to_f).round(2) end heading.wikipedia_links = wikipedia_links end wikipedia_links = [] @@ -163,99 +142,98 @@ @wikipedia_loaded = true end def linkify_summaries &block - @headings.each do |h| + (0..@headings_last_position).each do |i| + h = @headings[i] + # @headings.each do |h| h.linkify_summary &block end end - # NO LONGER COVERED BY TESTS - # def translate(locale, tr) - # return if @locales.include? locale - # @headings.each_with_index do |h, i| - # h.set_original_heading(tr.translate(h.original_heading), locale) - # h.set_natural_language_name(tr.translate(h.natural_language_name), locale) - # h.set_summary(tr.translate(h.summary), locale) - # h.entries.each { |entry| h.entries(locale) << tr.translate(entry) } - # h.entries(locale).sort! - # end - # - # @locales << locale - # end + def find_heading_by_unique_id(unique_id) + return @headings_by_unique_id[unique_id.hash] + end - def find(unique_id) - return @by_unique_id[unique_id] + def find_heading_by_tree_number(tree_number) + return @headings_by_tree_number[tree_number.hash] end - def find_by_tree_number(tree_number) - return @by_tree_number[tree_number] + def find_heading_by_main_heading(heading) + return @headings_by_original_heading[heading.hash] end - def find_by_original_heading(heading) - return @by_original_heading[heading] + def find_entry_by_term(term) + return @entries_by_term[term.hash] end - def find_by_entry(entry) - return @by_entry[entry_match_key(entry)] + def find_entry_by_loose_match(term) + return @entries_by_loose_match_term[Entry.loose_match(term).hash] end - def find_by_entry_word(word) - return @by_entry_word[word] + def find_entries_by_word(word) + return @entries_by_first_word[word.hash] end def where(conditions) matches = [] - @headings.each do |heading| + (0..@headings_last_position).each do |i| + # @headings.each do |heading| + heading = @headings[i] matches << heading if heading.matches(conditions) end matches end def each - for i in 0 ... @headings.size + (0..@headings_last_position).each do |i| + # for i in 0 ... @headings.size yield @headings[i] if @headings[i].useful end end - def match_in_text(text) + def match_in_text (text) return [] if text.nil? downcased = text.downcase - candidate_headings = Set.new - downcased.split(/\W+/).uniq.each do |word| - candidate_headings.merge(find_by_entry_word(word)) + candidate_entries = [] + text_words = @@sw.clarify(downcased).split(/\W+/) + text_words.uniq! + text_words.each do |word| + entries_by_word = find_entries_by_word(word) + candidate_entries << entries_by_word.to_a end + candidate_entries.compact! + candidate_entries.flatten! + # candidate_entries.uniq! #30% in this uniq + candidate_entries.keep_if { |entry| entry.heading.useful } + # puts "\n\n****\n#{candidate_entries.length}\n*****\n\n" matches = [] - candidate_headings.each do |heading| - next unless heading.useful - @locales.each do |locale| - heading.entries(locale).each do |entry| - if downcased.include? entry.downcase #This is a looser check than the regex but much, much faster - if /^[A-Z0-9]+$/ =~ entry - regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/ - else - regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i - end - text.to_enum(:scan, regex).map do |m,| - match = Regexp.last_match - matches << {heading: heading, matched: entry, index: match.offset(0)} - end - end - end - end + candidate_entries.each do |entry| + entry_matches = entry.match_in_text(text, downcased) + matches << entry_matches end - confirmed_matches = [] + + matches.compact! + matches.flatten! + matches.combination(2) do |l, r| if (r[:index][0] >= l[:index][0]) && (r[:index][1] <= l[:index][1]) #r is within l r[:delete] = true elsif (l[:index][0] >= r[:index][0]) && (l[:index][1] <= r[:index][1]) #l is within r l[:delete] = true end end matches.delete_if { |match| match[:delete] } + end + + private + + + def entry_match_key(e) + e.strip.upcase end end \ No newline at end of file