lib/MESH/mesh.rb in mesh-medical-subject-headings-1.2.2 vs lib/MESH/mesh.rb in mesh-medical-subject-headings-1.3.0

- old
+ new

@@ -1,75 +1,81 @@ -require_relative 'translator' - module MESH class Mesh - attr_accessor :unique_id, :original_heading, :tree_numbers, :parents, :children, :natural_language_name, :summary, :entries, :useful + include Comparable + attr_accessor :unique_id, :tree_numbers, :roots, :parents, :children, :useful, :descriptor_class - def original_heading(locale = nil) - return @original_heading if locale.nil? - @@translator.translate(@original_heading) + def <=> other + self.unique_id <=> other.unique_id end - def natural_language_name(locale = nil) - return @natural_language_name if locale.nil? - @@translator.translate(@natural_language_name) + def original_heading(locale = @@default_locale) + return @original_heading[locale] end - def summary(locale = nil) - return @summary if locale.nil? - @@translator.translate(@summary) + def natural_language_name(locale = @@default_locale) + return @natural_language_name[locale] end - def entries(locale = nil) - return @entries if locale.nil? - @entries.map { |entry| @@translator.translate(entry) }.sort + def summary(locale = @@default_locale) + return @summary[locale] end + def entries(locale = @@default_locale) + @entries[locale] ||= [] + return @entries[locale] + end + def self.configure(args) return if @@configured raise ArgumentError.new('MeshHeadingGraph requires a filename in order to configure itself') unless not args[:filename].nil? + gzipped_file = File.open(args[:filename]) file = Zlib::GzipReader.new(gzipped_file) + current_heading = Mesh.new file.each_line do |line| - if line.match(/^\*NEWRECORD$/) #Then store the previous record before continuing - unless current_heading.unique_id.nil? - current_heading.entries.sort! - @@headings << current_heading - @@by_unique_id[current_heading.unique_id] = current_heading - @@by_original_heading[current_heading.original_heading] = current_heading - current_heading.tree_numbers.each do |tree_number| - @@by_tree_number[tree_number] = current_heading + + case + + when matches = line.match(/^\*NEWRECORD$/) + unless current_heading.unique_id.nil? + current_heading.entries.sort! + @@headings << current_heading + @@by_unique_id[current_heading.unique_id] = current_heading + @@by_original_heading[current_heading.original_heading] = current_heading + current_heading.tree_numbers.each do |tree_number| + @@by_tree_number[tree_number] = current_heading + end end - end - current_heading = Mesh.new - end + current_heading = Mesh.new - matches = line.match(/^UI = (.*)/) - current_heading.unique_id = matches[1] unless matches.nil? + when matches = line.match(/^UI = (.*)/) + current_heading.unique_id = matches[1] - matches = line.match(/^MN = (.*)/) - current_heading.tree_numbers << matches[1] unless matches.nil? + when matches = line.match(/^MN = (.*)/) + current_heading.tree_numbers << matches[1] + current_heading.roots << matches[1][0] unless current_heading.roots.include?(matches[1][0]) - matches = line.match(/^MS = (.*)/) - current_heading.summary = matches[1] unless matches.nil? + when matches = line.match(/^MS = (.*)/) + current_heading.set_summary(matches[1]) - matches = line.match(/^MH = (.*)/) - unless matches.nil? - mh = matches[1] - current_heading.original_heading = mh - current_heading.natural_language_name = mh - current_heading.entries << mh - librarian_parts = mh.match(/(.*), (.*)/) - current_heading.natural_language_name = "#{librarian_parts[2]} #{librarian_parts[1]}" unless librarian_parts.nil? - end + when matches = line.match(/^DC = (.*)/) + current_heading.descriptor_class = @@descriptor_classes[matches[1].to_i] - matches = line.match(/^(?:PRINT )?ENTRY = ([^|]+)/) - unless matches.nil? - mh = matches[1].chomp - current_heading.entries << mh + when matches = line.match(/^MH = (.*)/) + mh = matches[1] + current_heading.set_original_heading(mh) + current_heading.entries << mh + librarian_parts = mh.match(/(.*), (.*)/) + nln = librarian_parts.nil? ? mh : "#{librarian_parts[2]} #{librarian_parts[1]}" + current_heading.set_natural_language_name(nln) + + when matches = line.match(/^(?:PRINT )?ENTRY = ([^|]+)/) + entry = matches[1].chomp + current_heading.entries << entry + end end @@by_unique_id.each do |id, heading| @@ -86,10 +92,23 @@ end end @@configured = true end + def self.translate(locale, tr) + return if @@locales.include? locale + @@headings.each_with_index do |h, i| + h.set_original_heading(tr.translate(h.original_heading), locale) + h.set_natural_language_name(tr.translate(h.natural_language_name), locale) + h.set_summary(tr.translate(h.summary), locale) + h.entries.each { |entry| h.entries(locale) << tr.translate(entry) } + h.entries(locale).sort! + end + + @@locales << locale + end + def self.find(unique_id) raise 'MeshHeadingGraph.configure must be called before use' unless @@configured return @@by_unique_id[unique_id] end @@ -116,60 +135,134 @@ yield @@headings[i] if @@headings[i].useful end end def self.match_in_text(text) - text = text.downcase + return [] if text.nil? + downcased = text.downcase matches = [] @@headings.each do |heading| next unless heading.useful - heading.entries.each do |entry| - if text.include? entry.downcase #This is a looser check than the regex but much, much faster - regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i - if regex =~ text - matches << {heading: heading, matched: entry} + @@locales.each do |locale| + heading.entries(locale).each do |entry| + if downcased.include? entry.downcase #This is a looser check than the regex but much, much faster + if /^[A-Z0-9]+$/ =~ entry + regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/ + else + regex = /(^|\W)#{Regexp.quote(entry)}(\W|$)/i + end + text.to_enum(:scan, regex).map do |m,| + matches << {heading: heading, matched: entry, index: $`.size} + end end end end end - matches + confirmed_matches = [] + matches.combination(2) do |l, r| + if (r[:index] >= l[:index]) && (r[:index] + r[:matched].length <= l[:index] + l[:matched].length) + #r is within l + r[:delete] = true + elsif (l[:index] >= r[:index]) && (l[:index] + l[:matched].length <= r[:index] + r[:matched].length) + #l is within r + l[:delete] = true + end + end + matches.delete_if { |match| match[:delete] } end + def has_ancestor(heading) + return false if parents.empty? + return true if parents.include? heading + in_grandparents = parents.map { |p| p.has_ancestor(heading) } + return in_grandparents.include? true + end + + def has_descendant(heading) + return false if children.empty? + return true if children.include? heading + in_grandchildren = children.map { |p| p.has_descendant(heading) } + return in_grandchildren.include? true + end + + def sibling?(heading) + common_parents = parents & heading.parents + !common_parents.empty? + end + + def deepest_position(root = '') + return nil if tree_numbers.empty? + deepest_tree_number = tree_numbers.max_by { |tn| tn.start_with?(root) ? tn.length : 0 } + deepest_tree_number.split('.').length + end + + def shallowest_position + return nil if tree_numbers.empty? + shallowest_tree_number = tree_numbers.min_by { |tn| tn.length } + shallowest_tree_number.split('.').length + end + + def self.cluster(headings) + return headings + end + def matches(conditions) conditions.each do |field, pattern| field_content = self.send(field) if field_content.kind_of?(Array) return false unless field_content.find { |fc| pattern =~ fc } elsif field_content.is_a?(TrueClass) || field_content.is_a?(FalseClass) return false unless field_content == pattern + elsif field_content.is_a? Symbol + return field_content == pattern else return false unless pattern =~ field_content end end return true end def inspect - return "#{@unique_id}, #{@original_heading}" + return "#{unique_id}, #{original_heading}, [#{tree_numbers.join(',')}]" end + def set_original_heading(heading, locale = @@default_locale) + @original_heading[locale] = heading + end + + def set_natural_language_name(name, locale = @@default_locale) + @natural_language_name[locale] = name + end + + def set_summary(summary, locale = @@default_locale) + @summary[locale] = summary + end + private @@configured = false @@headings = [] @@by_unique_id = {} @@by_tree_number = {} @@by_original_heading = {} @@default_locale = 'en-US' - @@translator = Translator.new + @@locales = [@@default_locale] + @@us_to_gb = Translator.new(Translator.enus_to_engb) + @@descriptor_classes = [:make_array_start_at_1, :topical_descriptor, :publication_type, :check_tag, :geographic_descriptor] def initialize @useful = true @tree_numbers = [] + @roots = [] @parents = [] @children = [] - @entries = [] + @entries = {} + @entries[@@default_locale] = [] + @original_heading = {} + @natural_language_name = {} + @summary = {} end + end end #