revs-utils.rb in revs-utils-2.0.0

- old
+ new

@@ -3,37 +3,32 @@
 require "revs-utils/version"
 require "countries"
 require 'active_support/core_ext/string'
 require 'active_support/core_ext/hash'
 require 'csv'
-require 'chronic'
 
 PROJECT_ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
 
 
 REVS_LC_TERMS_FILENAME=File.join(PROJECT_ROOT,'files','revs-lc-marque-terms.obj')
 REVS_MANIFEST_HEADERS_FILEPATH = File.join(PROJECT_ROOT,'config',"manifest_headers.yml")
 REGISTER = "register"
 METADATA = "metadata"
-OPTIONAL = "metadata_optional"
-FORMATS = "known_formats"
 
+
 module Revs
   module Utils
     
       
       # a hash of LC Subject Heading terms and their IDs for linking for "Automobiles" http://id.loc.gov/authorities/subjects/sh85010201.html
       # this is cached and loaded from disk and deserialized back into a hash for performance reasons, then stored as a module
       # level constant so it can be reused throughout the pre-assembly run as a constant
-      #  This cached set of terms can be re-generated with "ruby bin/revs_lc_automobile_terms.rb"
+      #  This cached set of terms can be re-generated with "ruby devel/revs_lc_automobile_terms.rb"
       AUTOMOBILE_LC_TERMS= File.open(REVS_LC_TERMS_FILENAME,'rb'){|io| Marshal.load(io)} if File.exists?(REVS_LC_TERMS_FILENAME)
       REVS_MANIFEST_HEADERS_FILE = File.open(REVS_MANIFEST_HEADERS_FILEPATH)
       REVS_MANIFEST_HEADERS = YAML.load( REVS_MANIFEST_HEADERS_FILE)
       
-      def revs_known_formats
-        get_manifest_section(FORMATS)
-      end
       
       def get_manifest_section(section)
         return REVS_MANIFEST_HEADERS[section]
       end
       
@@ -69,102 +64,54 @@
         end
         
         sources = Array.new
         files.each do |file|
           file.each do |row|
-            #Make sure the sourceid and filename are the same
+            #Make sure the sourceid and filename are the same
             fname = row[get_manifest_section(REGISTER)['filename']].chomp(File.extname(row[get_manifest_section(REGISTER)['filename']]))
-            return false if ((row[get_manifest_section(REGISTER)['sourceid']] != fname) || ((/\s/ =~ row[get_manifest_section(REGISTER)['sourceid']].strip) != nil))  
+            return false if row[get_manifest_section(REGISTER)['sourceid']] != fname
             sources << row[get_manifest_section(REGISTER)['sourceid']]
-          end         
+          end
+          
+          
+         
         end
         return sources.uniq.size == sources.size
       
       end
-            
+      
+      
       #Pass this function a CSV file and it will return true if the proper headers are there and each entry has the required fields filled in
       def valid_to_register(file_path)
+        
         file = read_csv_with_headers(file_path)
-        return check_valid_to_register(file)
-      end
-      
-      #Pass this function a CSV file and it will return true if the proper headers are there and each entry has the required fields filled in.  
-      def valid_for_metadata(file_path)
-        file = read_csv_with_headers(file_path)
-        return check_headers(file)
-      end
-      
-      # pass in csv data and it will tell if you everything is safe to register based on having labels, unique sourceIDs and filenames matching sourceIDs
-      def check_valid_to_register(csv_data)
         #Make sure all the required headers are there
-        result1=result2=result3=result4=true
-        if not get_manifest_section(REGISTER).values-csv_data[0].keys == []
-          puts "missing headers required for registration"
-          result1=false
-        end
-        sources=Array.new
+        return false if not get_manifest_section(REGISTER).values-file[0].keys == []
+        
         #Make sure all files have entries for those required headers
-        csv_data.each do |row|
+        file.each do |row|
           get_manifest_section(REGISTER).keys.each do |header| # label should be there as a column but does not always need a value
-             if header.downcase !='label' && row[header].blank? 
-               puts "#{row[get_manifest_section(REGISTER)['sourceid']]} does not have a value for a required registration field"
-               result2=false
-             end
+            return false if header.downcase !='label' && row[header].blank? #Alternatively consider row[header].class != String or row[header].size <= 0
           end
-          fname = row[get_manifest_section(REGISTER)['filename']].chomp(File.extname(row[get_manifest_section(REGISTER)['filename']]))
-          if ((row[get_manifest_section(REGISTER)['sourceid']] != fname) || ((/\s/ =~ row[get_manifest_section(REGISTER)['sourceid']].strip) != nil))  
-            puts "#{row[get_manifest_section(REGISTER)['sourceid']]} does not match the filename or has a space in it"            
-            result3=false
-          end
-          sources << row[get_manifest_section(REGISTER)['sourceid']]
         end
-        result4 = (sources.uniq.size == sources.size)
-        unless result4
-          puts "sourceIDs are not all unique" 
-          puts sources.uniq.map { | e | [sources.count(e), e] }.select { | c, _ | c > 1 }.sort.reverse.map { | c, e | "#{e}: #{c}" } # show all non-unique sourceIDs and their frequency
-        end
-        return (result1 && result2 && result3 && result4)
-        
+       return true
       end
       
-      # looks at certain metadata fields in manifest to confirm validity (such as dates and formats)
-      def check_metadata(csv_data)
-        bad_rows=0
-        csv_data.each do |row|
-          valid_date=revs_is_valid_datestring?(row[get_manifest_section(METADATA)['year']] || row[get_manifest_section(METADATA)['date']])
-          valid_format=revs_is_valid_format?(row[get_manifest_section(METADATA)['format']])
-          unless (valid_date && valid_format)
-            bad_rows+=1 
-            puts "#{row[get_manifest_section(REGISTER)['sourceid']]} has a bad year/date or format"
-          end
-        end
-        return bad_rows
-      end
-      
-      # pass in csv data from a file read in and it will tell you if the headers are valid
-      def check_headers(csv_data)
-        
-        result1=result2=true
-        file_headers=csv_data[0].keys.reject(&:blank?).collect(&:downcase)
+      #Pass this function a CSV file and it will return true if the proper headers are there and each entry has the required fields filled in.  
+      def valid_for_metadata(file_path)
+        file = read_csv_with_headers(file_path)
+        file_headers=file[0].keys
         #The file doesn't need to have all the metadata values, it just can't have headers that aren't used for metadata or registration
         if file_headers.include?('date') && file_headers.include?('year') # can't have both date and year 
-          puts "has both year and date columns"
-          result1=false
+          return false
+        elsif file_headers.include?('location') && file_headers.include?('state') && file_headers.include?('city') && file_headers.include?('country') # can't have both location and the specific fields
+          return false
+        else
+          return file_headers-get_manifest_section(METADATA).values-get_manifest_section(REGISTER).values == []
         end
-        if file_headers.include?('location') && file_headers.include?('state') && file_headers.include?('city') && file_headers.include?('country') # can't have both location and the specific fields
-          puts "has location column as well as specific state,city,country columns"
-          result2=false
-        end
-        extra_columns = file_headers-get_manifest_section(METADATA).values-get_manifest_section(REGISTER).values-get_manifest_section(OPTIONAL).values
-        has_extra_columns = (extra_columns == [])
-        puts "has unknown columns: #{extra_columns.join(', ')}" unless has_extra_columns
-        result3 = has_extra_columns
-        
-        return (result1 && result2 && result3)
-        
       end
-      
+
       def clean_collection_name(name)
         return "" if name.blank? || name.nil?
         name=name.to_s
         name.gsub!(/\A(the )/i,'')
         name.gsub!(/( of the revs institute)\z/i,'')
@@ -197,36 +144,23 @@
         end
 
         return row
       end
 
-      # checks to see if we have a valid format
-      def revs_is_valid_format?(format)
-        return true if format.nil? || format.blank?
-        formats=format.split("|").collect{|f| f.strip}
-        !formats.collect {|f| revs_known_formats.include?(f)}.uniq.include?(false)
-      end
-      
-      # check a single format and fix some common issues
       def revs_check_format(format)
         return revs_check_formats([format]).first
       end
       
-      # check the incoming array of formats and fix some common issues
+      # check the incoming format and fix some common issues
       def revs_check_formats(format)
         known_fixes = {"black-and-white negative"=>"black-and-white negatives",
                        "color negative"=>"color negatives",
                        "slides/color transparency"=>"color transparencies",
                        "color negatives/slides"=>"color negatives",
                        "black-and-white negative strips"=>"black-and-white negatives",
-                       "black and white"=>"black-and-white negatives",
-                       "black-and-white"=>"black-and-white negatives",                       
-                       "black and white negative"=>"black-and-white negatives",
-                       "black and white negatives"=>"black-and-white negatives",
                        "color transparency"=>"color transparencies",
-                       "slide"=>"slides",
-                       "color transparancies"=>"color transparencies"
+                       "slide"=>"slides"
                      }
         count = 0 
         format.each do |f|
           format[count] = known_fixes[f.downcase] || f.downcase
           count += 1
@@ -295,25 +229,15 @@
       # tell us if the string passed is a valid year
       def is_valid_year?(date_string,starting_year=1800)
         date_string.to_s.strip.scan(/\D/).empty? and (starting_year..Date.today.year).include?(date_string.to_i)
       end
 
-      # tell us if the incoming datestring supplied in the manifest column is a valid date, year or list of years
-      def revs_is_valid_datestring?(date_string)
-        return true if date_string.nil? || date_string.empty?
-        is_full_date=(get_full_date(date_string) != false)
-        is_year=!parse_years(date_string).empty?
-        return is_year || is_full_date
-      end
-      
-      # tell us if the string passed is in is a full date of the format M/D/YYYY or m-d-yyyy or m-d-yy or M/D/YY, and returns the date object if it is valid
+      # tell us if the string passed in is a full date of the format M/D/YYYY (or M-D-YYYY), and return the date object if it is valid (false otherwise)
       def get_full_date(date_string)
         begin
-          return false if date_string.scan(/(-|\/)/).count < 2 # we need at least two / or - characters to count as a full date
-          date_obj=Chronic.parse(date_string).to_date
-          date_obj=date_obj.prev_year(100) if date_obj > Date.today # if the parsing yields a date in the future, this is a problem, so adjust back a century (due to this issue: http://stackoverflow.com/questions/27058068/ruby-incorrectly-parses-2-digit-year)
-          is_valid_year?(date_obj.year.to_s) ? date_obj : false
+          date_obj=Date.strptime(date_string.gsub('-','/').delete(' '), '%m/%d/%Y')
+          return (is_valid_year?(date_obj.year.to_s) ? date_obj : false)
         rescue
           false
         end
       end
 
@@ -327,18 +251,18 @@
           result=date_string.split(',')
         end
         years_to_add=[]
         result.each do |year|
 
-          if year.scan(/[1-2][0-9][0-9][0-9][-][0-9][0-9]/).size > 0 && year.size == 7 # if we have a year that looks like "1961-62" or "1961-73", lets deal with it turning it into [1961,1962] or [1961,1962,1963,1964,1965,1966,1967...etc]
+          if year.scan(/[1-2][0-9][0-9][0-9][-][0-9][0-9]/).size > 0 # if we have a year that looks like "1961-62" or "1961-73", lets deal with it turning it into [1961,1962] or [1961,1962,1963,1964,1965,1966,1967...etc]
             start_year=year[2..3]
             end_year=year[5..6]
             stem=year[0..1] 
             for n in start_year..end_year
               years_to_add << "#{stem}#{n}"
             end
-          elsif year.scan(/[1-2][0-9][0-9][0-9][-][1-9]/).size > 0 && year.size == 6 # if we have a year that lloks like "1961-2" or "1961-3", lets deal with it turning it into [1961,1962] or [1961,1962,1963]
+          elsif year.scan(/[1-2][0-9][0-9][0-9][-][1-9]/).size > 0 # if we have a year that looks like "1961-2" or "1961-3", lets deal with it turning it into [1961,1962] or [1961,1962,1963]
             start_year=year[3..3]
             end_year=year[5..5]
             stem=year[0..2]
             for n in start_year..end_year
               years_to_add << "#{stem}#{n}"
@@ -349,10 +273,10 @@
             result.delete(year) # first delete the year itself from the list
             stem=year[0..2] # next get the stem, and expand into the whole decade
             %w{0 1 2 3 4 5 6 7 8 9}.each {|n| years_to_add << "#{stem}#{n}"} # add each year in that decade to the output array
           end
 
-          if year.scan(/[1-2][0-9][0-9][0-9][-][1-2][0-9][0-9][0-9]/).size > 0 && year.size == 9 # if we have a year that lloks like "1961-1962" or "1930-1955", lets deal with it turning it into [1961,1962] or [1961,1962,1963]
+          if year.scan(/[1-2][0-9][0-9][0-9][-][1-2][0-9][0-9][0-9]/).size > 0 # if we have a year that looks like "1961-1962" or "1930-1955", lets deal with it turning it into [1961,1962] or [1961,1962,1963]
             start_year=year[0..3]
             end_year=year[5..8]
             if end_year.to_i - start_year.to_i < 10 # let's only do the expansion if we don't have some really large date range, like "1930-1985" .. only ranges less than 9 years will be split into separate years
               for n in start_year..end_year
                 years_to_add << n