lib/MESH/tree.rb in mesh-medical-subject-headings-2.2.1 vs lib/MESH/tree.rb in mesh-medical-subject-headings-2.3.0

- old
+ new

@@ -9,10 +9,11 @@ @headings = [] @by_unique_id = {} @by_tree_number = {} @by_original_heading = {} @by_entry = {} + @by_entry_word = Hash.new { |h, k| h[k] = Set.new } @locales = [@@default_locale] filename = File.expand_path('../../../data/mesh_data_2014/d2014.bin.gz', __FILE__) gzipped_file = File.open(filename) file = Zlib::GzipReader.new(gzipped_file) @@ -40,21 +41,30 @@ def add_heading_to_hashes(mh) @headings << mh @by_unique_id[mh.unique_id] = mh @by_original_heading[mh.original_heading] = mh + add_heading_by_entry_word(mh, mh.original_heading) mh.tree_numbers.each do |tree_number| raise if @by_tree_number[tree_number] @by_tree_number[tree_number] = mh end match_headings = mh.entries.map { |e| entry_match_key(e) }.uniq match_headings.each do |entry| raise if @by_entry[entry] @by_entry[entry] = mh + add_heading_by_entry_word(mh, entry) end end + def add_heading_by_entry_word(mh, entry) + entry.split.each do |word| + word.downcase! + @by_entry_word[word] << mh + end + end + def entry_match_key(e) e.strip.upcase end def load_translation(locale) @@ -78,11 +88,14 @@ entries.uniq! if heading = find(unique_id) heading.set_original_heading(original_heading, locale) unless original_heading.nil? heading.set_natural_language_name(natural_language_name, locale) unless natural_language_name.nil? heading.set_summary(summary, locale) unless summary.nil? - entries.each { |entry| heading.entries(locale) << entry } + entries.each do |entry| + heading.entries(locale) << entry + add_heading_by_entry_word(heading, entry) + end end entries = [] original_heading = nil summary = nil @@ -185,10 +198,14 @@ def find_by_entry(entry) return @by_entry[entry_match_key(entry)] end + def find_by_entry_word(word) + return @by_entry_word[word] + end + def where(conditions) matches = [] @headings.each do |heading| matches << heading if heading.matches(conditions) end @@ -202,33 +219,38 @@ end def match_in_text(text) return [] if text.nil? downcased = text.downcase + candidate_headings = Set.new + downcased.split(/\W+/).uniq.each do |word| + candidate_headings.merge(find_by_entry_word(word)) + end matches = [] - @headings.each do |heading| + candidate_headings.each do |heading| next unless heading.useful @locales.each do |locale| heading.entries(locale).each do |entry| if downcased.include? entry.downcase #This is a looser check than the regex but much, much faster if /^[A-Z0-9]+$/ =~ entry regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/ else regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i end text.to_enum(:scan, regex).map do |m,| - matches << {heading: heading, matched: entry, index: $`.size} + match = Regexp.last_match + matches << {heading: heading, matched: entry, index: match.offset(0)} end end end end end confirmed_matches = [] matches.combination(2) do |l, r| - if (r[:index] >= l[:index]) && (r[:index] + r[:matched].length <= l[:index] + l[:matched].length) + if (r[:index][0] >= l[:index][0]) && (r[:index][1] <= l[:index][1]) #r is within l r[:delete] = true - elsif (l[:index] >= r[:index]) && (l[:index] + l[:matched].length <= r[:index] + r[:matched].length) + elsif (l[:index][0] >= r[:index][0]) && (l[:index][1] <= r[:index][1]) #l is within r l[:delete] = true end end matches.delete_if { |match| match[:delete] } \ No newline at end of file