lib/MESH/tree.rb in mesh-medical-subject-headings-2.2.1 vs lib/MESH/tree.rb in mesh-medical-subject-headings-2.3.0
- old
+ new
@@ -9,10 +9,11 @@
@headings = []
@by_unique_id = {}
@by_tree_number = {}
@by_original_heading = {}
@by_entry = {}
+ @by_entry_word = Hash.new { |h, k| h[k] = Set.new }
@locales = [@@default_locale]
filename = File.expand_path('../../../data/mesh_data_2014/d2014.bin.gz', __FILE__)
gzipped_file = File.open(filename)
file = Zlib::GzipReader.new(gzipped_file)
@@ -40,21 +41,30 @@
# Registers a heading in every lookup structure kept by the tree.
#
# mh - the heading to register; this method reads its unique_id,
#      original_heading, tree_numbers and entries.
#
# Raises (bare `raise`, no message) when one of the heading's tree numbers or
# normalized entry keys is already registered.
def add_heading_to_hashes(mh)
@headings << mh
@by_unique_id[mh.unique_id] = mh
@by_original_heading[mh.original_heading] = mh
# Also index the individual words of the original heading text.
+ add_heading_by_entry_word(mh, mh.original_heading)
mh.tree_numbers.each do |tree_number|
# A tree number may only ever point at one heading.
raise if @by_tree_number[tree_number]
@by_tree_number[tree_number] = mh
end
# Normalize the entries to match keys and de-duplicate them first, so entries
# of this heading that differ only in case or surrounding whitespace do not
# trip the collision check below.
match_headings = mh.entries.map { |e| entry_match_key(e) }.uniq
match_headings.each do |entry|
raise if @by_entry[entry]
@by_entry[entry] = mh
# `entry` is the normalized (upper-cased) key at this point.
+ add_heading_by_entry_word(mh, entry)
end
end
# Indexes a heading under every word of an entry so that word-based lookups
# can find it.
#
# mh    - the heading to register.
# entry - the entry text (String) to break into words.
#
# Words are split on runs of non-word characters (/\W+/), the same tokenizer
# match_in_text applies to the text it searches. Splitting on whitespace alone
# leaves punctuation attached ("hiv-1", "disease,"), and such keys can never be
# hit by a token taken from the text. Keys are stored lower-cased; the caller's
# string is not modified.
def add_heading_by_entry_word(mh, entry)
  entry.split(/\W+/).each do |word|
    # A leading non-word character yields an empty first token; skip it.
    next if word.empty?

    @by_entry_word[word.downcase] << mh
  end
end
+
# Builds the normalized key under which an entry is stored and looked up:
# surrounding whitespace is removed and the text is upper-cased.
def entry_match_key(e)
  trimmed = e.strip
  trimmed.upcase
end
def load_translation(locale)
@@ -78,11 +88,14 @@
entries.uniq!
if heading = find(unique_id)
heading.set_original_heading(original_heading, locale) unless original_heading.nil?
heading.set_natural_language_name(natural_language_name, locale) unless natural_language_name.nil?
heading.set_summary(summary, locale) unless summary.nil?
- entries.each { |entry| heading.entries(locale) << entry }
+ entries.each do |entry|
+ heading.entries(locale) << entry
+ add_heading_by_entry_word(heading, entry)
+ end
end
entries = []
original_heading = nil
summary = nil
@@ -185,10 +198,14 @@
# Looks up the heading registered under the given entry term. The term is
# normalized with entry_match_key first, so surrounding whitespace and letter
# case do not matter. Returns nil when nothing is stored under that key.
def find_by_entry(entry)
  key = entry_match_key(entry)
  @by_entry[key]
end
# Returns the Set of headings indexed under the given word, or an empty Set
# when the word is unknown.
#
# The word is lower-cased before the lookup because index keys are stored
# lower-cased. The lookup goes through fetch rather than []: @by_entry_word is
# created with a default block that inserts a new Set on every missed [] read,
# so probing it with arbitrary words would keep adding empty entries to the
# index.
def find_by_entry_word(word)
  @by_entry_word.fetch(word.to_s.downcase) { Set.new }
end
+
def where(conditions)
matches = []
@headings.each do |heading|
matches << heading if heading.matches(conditions)
end
@@ -202,33 +219,38 @@
end
def match_in_text(text)
return [] if text.nil?
downcased = text.downcase
+ candidate_headings = Set.new
+ downcased.split(/\W+/).uniq.each do |word|
+ candidate_headings.merge(find_by_entry_word(word))
+ end
matches = []
- @headings.each do |heading|
+ candidate_headings.each do |heading|
next unless heading.useful
@locales.each do |locale|
heading.entries(locale).each do |entry|
if downcased.include? entry.downcase #This is a looser check than the regex but much, much faster
if /^[A-Z0-9]+$/ =~ entry
regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/
else
regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i
end
text.to_enum(:scan, regex).map do |m,|
- matches << {heading: heading, matched: entry, index: $`.size}
+ match = Regexp.last_match
+ matches << {heading: heading, matched: entry, index: match.offset(0)}
end
end
end
end
end
confirmed_matches = []
matches.combination(2) do |l, r|
- if (r[:index] >= l[:index]) && (r[:index] + r[:matched].length <= l[:index] + l[:matched].length)
+ if (r[:index][0] >= l[:index][0]) && (r[:index][1] <= l[:index][1])
#r is within l
r[:delete] = true
- elsif (l[:index] >= r[:index]) && (l[:index] + l[:matched].length <= r[:index] + r[:matched].length)
+ elsif (l[:index][0] >= r[:index][0]) && (l[:index][1] <= r[:index][1])
#l is within r
l[:delete] = true
end
end
matches.delete_if { |match| match[:delete] }
\ No newline at end of file