lib/revs-utils.rb in revs-utils-1.0.14 vs lib/revs-utils.rb in revs-utils-1.0.15

- old
+ new

@@ -12,12 +12,12 @@ REVS_LC_TERMS_FILENAME=File.join(PROJECT_ROOT,'files','revs-lc-marque-terms.obj') REVS_MANIFEST_HEADERS_FILEPATH = File.join(PROJECT_ROOT,'config',"manifest_headers.yml") REGISTER = "register" METADATA = "metadata" +FORMATS = "known_formats" - module Revs module Utils # a hash of LC Subject Heading terms and their IDs for linking for "Automobiles" http://id.loc.gov/authorities/subjects/sh85010201.html @@ -26,10 +26,13 @@ # This cached set of terms can be re-generated with "ruby devel/revs_lc_automobile_terms.rb" AUTOMOBILE_LC_TERMS= File.open(REVS_LC_TERMS_FILENAME,'rb'){|io| Marshal.load(io)} if File.exists?(REVS_LC_TERMS_FILENAME) REVS_MANIFEST_HEADERS_FILE = File.open(REVS_MANIFEST_HEADERS_FILEPATH) REVS_MANIFEST_HEADERS = YAML.load( REVS_MANIFEST_HEADERS_FILE) + def revs_known_formats + get_manifest_section(FORMATS) + end def get_manifest_section(section) return REVS_MANIFEST_HEADERS[section] end @@ -65,54 +68,73 @@ end sources = Array.new files.each do |file| file.each do |row| - #Make sure the sourcid and filename are the same + #Make sure the sourceid and filename are the same fname = row[get_manifest_section(REGISTER)['filename']].chomp(File.extname(row[get_manifest_section(REGISTER)['filename']])) return false if row[get_manifest_section(REGISTER)['sourceid']] != fname sources << row[get_manifest_section(REGISTER)['sourceid']] - end - - - + end end return sources.uniq.size == sources.size end - - + #Pass this function a CSV file and it will return true if the proper headers are there and each entry has the required fields filled in def valid_to_register(file_path) - file = read_csv_with_headers(file_path) + return check_valid_to_register(file) + end + + #Pass this function a CSV file and it will return true if the proper headers are there and each entry has the required fields filled in. 
+ def valid_for_metadata(file_path) + file = read_csv_with_headers(file_path) + return check_headers(file) + end + + # pass in csv data and it will tell you if everything is safe to register based on having labels, unique sourceIDs and filenames matching sourceIDs + def check_valid_to_register(csv_data) #Make sure all the required headers are there - return false if not get_manifest_section(REGISTER).values-file[0].keys == [] - + return false if not get_manifest_section(REGISTER).values-csv_data[0].keys == [] + sources=Array.new #Make sure all files have entries for those required headers - file.each do |row| + csv_data.each do |row| get_manifest_section(REGISTER).keys.each do |header| # label should be there as a column but does not always need a value return false if header.downcase !='label' && row[header].blank? #Alternatively consider row[header].class != String or row[header].size <= 0 end + fname = row[get_manifest_section(REGISTER)['filename']].chomp(File.extname(row[get_manifest_section(REGISTER)['filename']])) + return false if row[get_manifest_section(REGISTER)['sourceid']] != fname + sources << row[get_manifest_section(REGISTER)['sourceid']] end - return true + return sources.uniq.size == sources.size end - #Pass this function a CSV file and it will return true if the proper headers are there and each entry has the required fields filled in. 
- def valid_for_metadata(file_path) - file = read_csv_with_headers(file_path) - file_headers=file[0].keys.reject(&:blank?).collect(&:downcase) + # looks at certain metadata fields in manifest to confirm validity (such as dates and formats) + def check_metadata(csv_data) + bad_rows=0 + csv_data.each do |row| + valid_date=revs_is_valid_datestring?(row[get_manifest_section(METADATA)['year']] || row[get_manifest_section(METADATA)['date']]) + valid_format=revs_is_valid_format?(row[get_manifest_section(METADATA)['format']]) + bad_rows+=1 unless (valid_date && valid_format) + end + return bad_rows + end + + # pass in csv data from a file read in and it will tell you if the headers are valid + def check_headers(csv_data) + file_headers=csv_data[0].keys.reject(&:blank?).collect(&:downcase) #The file doesn't need to have all the metadata values, it just can't have headers that aren't used for metadata or registration if file_headers.include?('date') && file_headers.include?('year') # can't have both date and year return false elsif file_headers.include?('location') && file_headers.include?('state') && file_headers.include?('city') && file_headers.include?('country') # can't have both location and the specific fields return false else return file_headers-get_manifest_section(METADATA).values-get_manifest_section(REGISTER).values == [] end end - + def clean_collection_name(name) return "" if name.blank? || name.nil? 
name=name.to_s name.gsub!(/\A(the )/i,'') name.gsub!(/( of the revs institute)\z/i,'') @@ -145,11 +167,17 @@ end return row end - # check a single format single and fix some common issues + # checks to see if we have a valid format + def revs_is_valid_format?(format) + formats=format.split("|").collect{|f| f.strip} + !formats.collect {|f| revs_known_formats.include?(f)}.uniq.include?(false) + end + + # check a single format and fix some common issues def revs_check_format(format) return revs_check_formats([format]).first end # check the incoming array of formats and fix some common issues @@ -235,9 +263,17 @@ # tell us if the string passed is a valid year def is_valid_year?(date_string,starting_year=1800) date_string.to_s.strip.scan(/\D/).empty? and (starting_year..Date.today.year).include?(date_string.to_i) end + # tell us if the incoming datestring supplied in the manifest column is a valid date, year or list of years + def revs_is_valid_datestring?(date_string) + return true if date_string.nil? || date_string.empty? + is_full_date=(get_full_date(date_string) != false) + is_year=!parse_years(date_string).empty? + return is_year || is_full_date + end + # tell us if the string passed is a full date of the format M/D/YYYY or m-d-yyyy or m-d-yy or M/D/YY, and returns the date object if it is valid def get_full_date(date_string) begin return false if date_string.scan(/(-|\/)/).count < 2 # we need at least two / or - characters to count as a full date date_obj=Chronic.parse(date_string).to_date