lib/bio-publisci/dataset/data_cube.rb in bio-publisci-0.0.2 vs lib/bio-publisci/dataset/data_cube.rb in bio-publisci-0.0.3

- old
+ new

@@ -1,70 +1,114 @@ #monkey patch to make rdf string w/ heredocs prettier ;) - class String - def unindent - gsub /^#{self[/\A\s*/]}/, '' - # gsub(/^#{scan(/^\s*/).min_by{|l|l.length}}/, "") - end +class String + def unindent + gsub /^#{self[/\A\s*/]}/, '' end +end module R2RDF - # used to generate data cube observations, data structure definitions, etc - module Dataset + class Dataset module DataCube + include R2RDF::Parser def defaults { type: :dataframe, encode_nulls: false, base_url: "http://www.rqtl.org", } - end - - def generate(measures, dimensions, codes, data, observation_labels, var, options={}) - dimensions = sanitize(dimensions) - codes = sanitize(codes) - measures = sanitize(measures) - var = sanitize([var]).first - data = sanitize_hash(data) + end - str = prefixes(var,options) - str << data_structure_definition((measures | dimensions), var, options) - str << dataset(var, options) - component_specifications(measures, dimensions, var, options).map{ |c| str << c } - dimension_properties(dimensions, codes, var, options).map{|p| str << p} - measure_properties(measures, var, options).map{|p| str << p} - code_lists(codes, data, var, options).map{|l| str << l} - concept_codes(codes, data, var, options).map{|c| str << c} - observations(measures, dimensions, codes, data, observation_labels, var, options).map{|o| str << o} - str - end - - def sanitize(array) - #remove spaces and other special characters - processed = [] - array.map{|entry| - if entry.is_a? String - processed << entry.gsub(/[\s\.]/,'_') + def generate_resources(measures, dimensions, codes, options={}) + newm = measures.map {|m| + if m =~ /^http:\/\// + "<#{m}>" + elsif m =~ /^[a-zA-z]+:[a-zA-z]+$/ + m else - processed << entry - end + "prop:#{m}" + end } - processed + + newc = [] + + newd = dimensions.map{|d| + if d =~ /^http:\/\// + # newc << "<#{d}>" if codes.include? d + "<#{d}>" + elsif d =~ /^[a-zA-z]+:[a-zA-z]+$/ + d + else + # newc << "prop:#{d}" if codes.include? d + "prop:#{d}" + end + } + + if codes.first.is_a? Array + newc = codes.map{|c| + c.map{|el| + if el =~ /^http:\/\// + "<#{el}>" + else + el + end + } + } + else + newc = codes.map{|c| + ["#{c}","code:#{c.downcase}","code:#{c.downcase.capitalize}"] + } + end + [newm, newd, newc] end - def sanitize_hash(h) - mappings = {} - h.keys.map{|k| - if(k.is_a? String) - mappings[k] = k.gsub(' ','_') + def encode_data(codes,data,var,options={}) + new_data = {} + data.map{|k,v| + if codes.include? k + new_data[k] = v.map{|val| + if val =~ /^http:\/\// + "<#{val}>" + elsif val =~ /^[a-zA-z]+:[a-zA-z]+$/ + val + else + "<code/#{k.downcase}/#{val}>" + end + } + else + new_data[k] = v end } + new_data + end - h.keys.map{|k| - h[mappings[k]] = h.delete(k) if mappings[k] - } + def vocabulary(vocab,options={}) + if vocab.is_a?(String) && vocab =~ /^http:\/\// + RDF::Vocabulary.new(vocab) + elsif RDF.const_defined? vocab.to_sym && RDF.const_get(vocab.to_sym).inspect =~ /^RDF::Vocabulary/ + RDF.const_get(vocab) + else + nil + end + end + + def generate(measures, dimensions, codes, data, observation_labels, var, options={}) + # dimensions = sanitize(dimensions) + # codes = sanitize(codes) + # measures = sanitize(measures) + var = sanitize([var]).first + data = sanitize_hash(data) - h + str = prefixes(var,options) + str << data_structure_definition(measures, dimensions, codes, var, options) + str << dataset(var, options) + # component_specifications(measures, dimensions, var, options).map{ |c| str << c } + dimension_properties(dimensions, codes, var, options).map{|p| str << p} + measure_properties(measures, var, options).map{|p| str << p} + code_lists(codes, data, var, options).map{|l| str << l} + concept_codes(codes, data, var, options).map{|c| str << c} + observations(measures, dimensions, codes, data, observation_labels, var, options).map{|o| str << o} + str end def prefixes(var, options={}) var = sanitize([var]).first options = defaults().merge(options) @@ -78,28 +122,32 @@ @prefix prop: <#{base}/dc/properties/> . @prefix dct: <http://purl.org/dc/terms/> . @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . @prefix cs: <#{base}/dc/dataset/#{var}/cs/> . @prefix code: <#{base}/dc/dataset/#{var}/code/> . - @prefix class: <#{base}/dc/dataset/#{var}/class/> . @prefix owl: <http://www.w3.org/2002/07/owl#> . @prefix skos: <http://www.w3.org/2004/02/skos/core#> . @prefix foaf: <http://xmlns.com/foaf/0.1/> . @prefix org: <http://www.w3.org/ns/org#> . @prefix prov: <http://www.w3.org/ns/prov#> . EOF end - def data_structure_definition(components,var,options={}) + def data_structure_definition(measures,dimensions,codes,var,options={}) var = sanitize([var]).first options = defaults().merge(options) + rdf_measures, rdf_dimensions, rdf_codes = generate_resources(measures, dimensions, codes, options) + str = "ns:dsd-#{var} a qb:DataStructureDefinition;\n" - str << " qb:component\n" - components.map{|n| - str << " cs:#{n} ,\n" + rdf_dimensions.map{|d| + str << " qb:component [ qb:dimension #{d} ] ;\n" } + + rdf_measures.map{|m| + str << " qb:component [ qb:measure #{m} ] ;\n" + } str[-2]='.' str<<"\n" str end @@ -139,104 +187,142 @@ specs end def dimension_properties(dimensions, codes, var, options={}) options = defaults().merge(options) + rdf_measures, rdf_dimensions, rdf_codes = generate_resources([], dimensions, codes, options) props = [] - - dimensions.map{|d| - if codes.include?(d) - props << <<-EOF.unindent - prop:#{d} a rdf:Property, qb:DimensionProperty ; - rdfs:label "#{d}"@en ; - qb:codeList code:#{d.downcase} ; - rdfs:range code:#{d.downcase.capitalize} . - EOF - else - props << <<-EOF.unindent - prop:#{d} a rdf:Property, qb:DimensionProperty ; - rdfs:label "#{d}"@en . + dimension_codes = rdf_codes.map{|c| + if c[0]=~/^<http:/ + c[0][1..-2] + else + c[0] + end + } - EOF - end + rdf_dimensions.each_with_index{|d,i| + if dimension_codes.include?(dimensions[i]) + code = rdf_codes[dimension_codes.index(dimensions[i])] + props << <<-EOF.unindent + #{d} a rdf:Property, qb:DimensionProperty ; + rdfs:label "#{strip_prefixes(strip_uri(d))}"@en ; + qb:codeList #{code[1]} ; + rdfs:range #{code[2]} . + + EOF + else + props << <<-EOF.unindent + #{d} a rdf:Property, qb:DimensionProperty ; + rdfs:label "#{strip_prefixes(strip_uri(d))}"@en . + + EOF + end } props end def measure_properties(measures, var, options={}) options = defaults().merge(options) + rdf_measures = generate_resources(measures, [], [], options)[0] props = [] - measures.map{ |m| + rdf_measures.map{ |m| props << <<-EOF.unindent - prop:#{m} a rdf:Property, qb:MeasureProperty ; - rdfs:label "#{m}"@en . + #{m} a rdf:Property, qb:MeasureProperty ; + rdfs:label "#{strip_prefixes(strip_uri(m))}"@en . EOF } props end def observations(measures, dimensions, codes, data, observation_labels, var, options={}) var = sanitize([var]).first options = defaults().merge(options) + rdf_measures, rdf_dimensions, rdf_codes = generate_resources(measures, dimensions, codes, options) + data = encode_data(codes, data, var, options) obs = [] + + dimension_codes = rdf_codes.map{|c| + if c[0]=~/^<http:/ + c[0][1..-2] + else + c[0] + end + } + observation_labels.each_with_index.map{|r, i| contains_nulls = false str = <<-EOF.unindent ns:obs#{r} a qb:Observation ; qb:dataSet ns:dataset-#{var} ; EOF str << " rdfs:label \"#{r}\" ;\n" unless options[:no_labels] - dimensions.map{|d| + dimensions.each_with_index{|d,j| contains_nulls = contains_nulls | (data[d][i] == nil) - if codes.include? d - str << " prop:#{d} <code/#{d.downcase}/#{data[d][i]}> ;\n" + + if dimension_codes.include? d + # str << " #{rdf_dimensions[j]} <code/#{d.downcase}/#{data[d][i]}> ;\n" + str << " #{rdf_dimensions[j]} #{to_resource(data[d][i], options)} ;\n" else - str << " prop:#{d} ns:#{to_resource(data[d][i], options)} ;\n" + str << " #{rdf_dimensions[j]} #{to_literal(data[d][i], options)} ;\n" end } - measures.map{|m| + measures.each_with_index{|m,j| contains_nulls = contains_nulls | (data[m][i] == nil) - str << " prop:#{m} #{to_literal(data[m][i], options)} ;\n" + str << " #{rdf_measures[j]} #{to_literal(data[m][i], options)} ;\n" } str << " .\n\n" - obs << str unless contains_nulls && !options[:encode_nulls] - + if contains_nulls && !options[:encode_nulls] + if options[:raise_nils] + raise "missing component for observation, skipping: #{str}, " + elsif options[:whiny_nils] + puts "missing component for observation, skipping: #{str}, " + end + else + obs << str + end } obs end def code_lists(codes, data, var, options={}) options = defaults().merge(options) + rdf_measures, rdf_dimensions, rdf_codes = generate_resources([], [], codes, options) + data = encode_data(codes, data, var, options) lists = [] - codes.map{|code| + rdf_codes.map{|code| + if code[0] =~ /^<.+>$/ + refcode = code[0][1..-2] + else + refcode = code[0] + end str = <<-EOF.unindent - code:#{code.downcase.capitalize} a rdfs:Class, owl:Class; + #{code[2]} a rdfs:Class, owl:Class; rdfs:subClassOf skos:Concept ; - rdfs:label "Code list for #{code} - codelist class"@en; - rdfs:comment "Specifies the #{code} for each observation"; - rdfs:seeAlso code:#{code.downcase} . + rdfs:label "Code list for #{strip_prefixes(strip_uri(code[1]))} - codelist class"@en; + rdfs:comment "Specifies the #{strip_prefixes(strip_uri(code[1]))} for each observation"; + rdfs:seeAlso #{code[1]} . - code:#{code.downcase} a skos:ConceptScheme; - skos:prefLabel "Code list for #{code} - codelist scheme"@en; - rdfs:label "Code list for #{code} - codelist scheme"@en; - skos:notation "CL_#{code.upcase}"; - skos:note "Specifies the #{code} for each observation"; + #{code[1]} a skos:ConceptScheme; + skos:prefLabel "Code list for #{strip_prefixes(strip_uri(code[1]))} - codelist scheme"@en; + rdfs:label "Code list for #{strip_prefixes(strip_uri(code[1]))} - codelist scheme"@en; + skos:notation "CL_#{strip_prefixes(strip_uri(code[1])).upcase}"; + skos:note "Specifies the #{strip_prefixes(strip_uri(code[1]))} for each observation"; EOF - data[code].uniq.map{|value| + data[refcode].uniq.map{|value| unless value == nil && !options[:encode_nulls] - str << " skos:hasTopConcept <code/#{code.downcase}/#{to_resource(value,options)}> ;\n" + str << " skos:hasTopConcept #{to_resource(value,options)} ;\n" end } str << " .\n\n" lists << str @@ -246,63 +332,38 @@ lists end def concept_codes(codes, data, var, options={}) options = defaults().merge(options) + rdf_measures, rdf_dimensions, rdf_codes = generate_resources([], [], codes, options) concepts = [] - codes.map{|code| - data[code].uniq.map{|value| + data = encode_data(codes, data, var, options) + rdf_codes.map{|code| + if code[0] =~ /^<.+>$/ + refcode = code[0][1..-2] + else + refcode = code[0] + end + data[refcode].uniq.each_with_index{|value,i| unless value == nil && !options[:encode_nulls] concepts << <<-EOF.unindent - <code/#{code.downcase}/#{to_resource(value,options)}> a skos:Concept, code:#{code.downcase.capitalize}; - skos:topConceptOf code:#{code.downcase} ; - skos:prefLabel "#{to_resource(value,options)}" ; - skos:inScheme code:#{code.downcase} . + #{to_resource(value,options)} a skos:Concept, #{code[2]}; + skos:topConceptOf #{code[1]} ; + skos:prefLabel "#{strip_uri(data[refcode][i])}" ; + skos:inScheme #{code[1]} . EOF end } } concepts end - def to_resource(obj, options) - if obj.is_a? String - #TODO decide the right way to handle missing values, since RDF has no null - #probably throw an error here since a missing resource is a bigger problem - obj = "NA" if obj.empty? - - #TODO remove special characters (faster) as well (eg '?') - obj.gsub(' ','_').gsub('?','') - elsif obj == nil && options[:encode_nulls] - '"NA"' - elsif obj.is_a? Numeric - #resources cannot be referred to purely by integer (?) - "n"+obj.to_s - else - obj - end - end - - def to_literal(obj, options) - if obj.is_a? String - # Depressing that there's no more elegant way to check if a string is - # a number... - if val = Integer(obj) rescue nil - val - elsif val = Float(obj) rescue nil - val - else - '"'+obj+'"' - end - elsif obj == nil && options[:encode_nulls] - #TODO decide the right way to handle missing values, since RDF has no null - '"NA"' - else - obj - end + def abbreviate_known(turtle_string) + #debug method + turtle_string.gsub(/<http:\/\/www\.rqtl\.org\/dc\/properties\/(\S+)>/, 'prop:\1').gsub(/<http:\/\/www.rqtl.org\/ns\/dc\/code\/(\S+)\/(\S+)>/, '<code/\1/\2>').gsub(/<http:\/\/www.rqtl.org\/dc\/dataset\/(\S+)\/code\/(\S+)>/, 'code:\2') end end end end