lib/bio-publisci/dataset/data_cube.rb in bio-publisci-0.0.2 vs lib/bio-publisci/dataset/data_cube.rb in bio-publisci-0.0.3
- old
+ new
@@ -1,70 +1,114 @@
#monkey patch to make rdf string w/ heredocs prettier ;)
# String#unindent reads the whitespace run at the very start of the string
# (/\A\s*/) and strips that same run from the beginning of every line, so a
# heredoc can be indented along with the surrounding code and still be
# emitted flush-left. gsub returns a new String; the receiver is not modified.
# NOTE(review): this reopens the core String class, so the method becomes
# visible on every String once this file is loaded.
- class String
- def unindent
- gsub /^#{self[/\A\s*/]}/, ''
- # gsub(/^#{scan(/^\s*/).min_by{|l|l.length}}/, "")
- end
+class String
+ def unindent
+ gsub /^#{self[/\A\s*/]}/, ''
end
+end
module R2RDF
- # used to generate data cube observations, data structure definitions, etc
- module Dataset
+ class Dataset
module DataCube
+ include R2RDF::Parser
# Default option values. Most generator methods in this module begin with
# `defaults().merge(options)`, so caller-supplied keys override these.
#   type:         :dataframe (not read anywhere in the code shown here)
#   encode_nulls: false; observations and code-list entries whose value is
#                 nil are left out unless the caller sets this to true
#   base_url:     "http://www.rqtl.org"; presumably the base of the generated
#                 @prefix URIs, but the line that reads it is not visible
#                 here - TODO confirm
# The removed lines interleaved below are the previous version of `generate`.
def defaults
{
type: :dataframe,
encode_nulls: false,
base_url: "http://www.rqtl.org",
}
- end
-
- def generate(measures, dimensions, codes, data, observation_labels, var, options={})
- dimensions = sanitize(dimensions)
- codes = sanitize(codes)
- measures = sanitize(measures)
- var = sanitize([var]).first
- data = sanitize_hash(data)
+ end
- str = prefixes(var,options)
- str << data_structure_definition((measures | dimensions), var, options)
- str << dataset(var, options)
- component_specifications(measures, dimensions, var, options).map{ |c| str << c }
- dimension_properties(dimensions, codes, var, options).map{|p| str << p}
- measure_properties(measures, var, options).map{|p| str << p}
- code_lists(codes, data, var, options).map{|l| str << l}
- concept_codes(codes, data, var, options).map{|c| str << c}
- observations(measures, dimensions, codes, data, observation_labels, var, options).map{|o| str << o}
- str
- end
-
- def sanitize(array)
- #remove spaces and other special characters
- processed = []
- array.map{|entry|
- if entry.is_a? String
- processed << entry.gsub(/[\s\.]/,'_')
# Converts measure, dimension and code names into the term strings written
# into the Turtle output. Returns a three-element array [newm, newd, newc].
#
# Measures and dimensions are each mapped name by name:
#   * a name starting with "http://" is wrapped in angle brackets,
#   * a name of the form "letters:letters" (already prefixed) is kept as is,
#   * anything else gets the "prop:" prefix.
# Codes take one of two shapes, decided by the first element of `codes`:
#   * an Array of Arrays: each inner element is wrapped in angle brackets
#     when it starts with "http://" and otherwise left untouched;
#   * otherwise each code c becomes the triple
#     [c, "code:<c downcased>", "code:<c downcased, then capitalized>"].
# `options` is accepted but not read here.
#
# NOTE(review): the character class [a-zA-z] also matches the ASCII
# punctuation that sits between "Z" and "a" ([ \ ] ^ _ `); [a-zA-Z] was
# probably intended - confirm.
# The removed lines interleaved below belong to the old `sanitize` method.
+ def generate_resources(measures, dimensions, codes, options={})
+ newm = measures.map {|m|
+ if m =~ /^http:\/\//
+ "<#{m}>"
+ elsif m =~ /^[a-zA-z]+:[a-zA-z]+$/
+ m
else
- processed << entry
- end
+ "prop:#{m}"
+ end
}
- processed
+
+ newc = []
+
+ newd = dimensions.map{|d|
+ if d =~ /^http:\/\//
+ # newc << "<#{d}>" if codes.include? d
+ "<#{d}>"
+ elsif d =~ /^[a-zA-z]+:[a-zA-z]+$/
+ d
+ else
+ # newc << "prop:#{d}" if codes.include? d
+ "prop:#{d}"
+ end
+ }
+
+ if codes.first.is_a? Array
+ newc = codes.map{|c|
+ c.map{|el|
+ if el =~ /^http:\/\//
+ "<#{el}>"
+ else
+ el
+ end
+ }
+ }
+ else
+ newc = codes.map{|c|
+ ["#{c}","code:#{c.downcase}","code:#{c.downcase.capitalize}"]
+ }
+ end
+ [newm, newd, newc]
end
- def sanitize_hash(h)
- mappings = {}
- h.keys.map{|k|
- if(k.is_a? String)
- mappings[k] = k.gsub(' ','_')
# Builds and returns a new hash with the same keys as `data`. For keys that
# appear in `codes`, every value in the column is turned into a term string:
#   * a value starting with "http://" is wrapped in angle brackets,
#   * a value of the form "letters:letters" is kept as is,
#   * anything else becomes "<code/<key downcased>/<value>>".
# Columns whose key is not in `codes` are stored unchanged (the same array
# object, not a copy). `var` and `options` are accepted but not read here.
#
# NOTE(review): the character class [a-zA-z] also matches the ASCII
# punctuation between "Z" and "a"; [a-zA-Z] was probably intended - confirm.
+ def encode_data(codes,data,var,options={})
+ new_data = {}
+ data.map{|k,v|
+ if codes.include? k
+ new_data[k] = v.map{|val|
+ if val =~ /^http:\/\//
+ "<#{val}>"
+ elsif val =~ /^[a-zA-z]+:[a-zA-z]+$/
+ val
+ else
+ "<code/#{k.downcase}/#{val}>"
+ end
+ }
+ else
+ new_data[k] = v
end
}
+ new_data
+ end
- h.keys.map{|k|
- h[mappings[k]] = h.delete(k) if mappings[k]
- }
# Resolves `vocab` to a vocabulary object. A String starting with "http://"
# is passed to RDF::Vocabulary.new. Otherwise the apparent intent is to look
# up a constant of that name under RDF and return it when its inspect string
# starts with "RDF::Vocabulary". Falls through to nil. `options` is not read.
#
# NOTE(review): written without parentheses, `RDF.const_defined? a && b`
# passes the whole `a && b` expression as the single argument of
# const_defined?, so the elsif condition does not test the two parts
# separately - confirm and parenthesize the first call if that is unintended.
+ def vocabulary(vocab,options={})
+ if vocab.is_a?(String) && vocab =~ /^http:\/\//
+ RDF::Vocabulary.new(vocab)
+ elsif RDF.const_defined? vocab.to_sym && RDF.const_get(vocab.to_sym).inspect =~ /^RDF::Vocabulary/
+ RDF.const_get(vocab)
+ else
+ nil
+ end
+ end
+
# Entry point that assembles the complete Turtle document as one String.
# After sanitizing `var` and the keys of `data`, it appends, in order: the
# prefix declarations, the data structure definition, the dataset
# declaration, the dimension properties, the measure properties, the code
# lists, the concept codes and finally the observations.
# Sanitizing of measures, dimensions and codes is commented out in this
# version, as is the component_specifications step.
# NOTE(review): `sanitize` and `sanitize_hash` are still called here although
# their definitions appear only as removed lines in this file; presumably
# they now come from the included R2RDF::Parser - TODO confirm.
+ def generate(measures, dimensions, codes, data, observation_labels, var, options={})
+ # dimensions = sanitize(dimensions)
+ # codes = sanitize(codes)
+ # measures = sanitize(measures)
+ var = sanitize([var]).first
+ data = sanitize_hash(data)
- h
+ str = prefixes(var,options)
+ str << data_structure_definition(measures, dimensions, codes, var, options)
+ str << dataset(var, options)
+ # component_specifications(measures, dimensions, var, options).map{ |c| str << c }
+ dimension_properties(dimensions, codes, var, options).map{|p| str << p}
+ measure_properties(measures, var, options).map{|p| str << p}
+ code_lists(codes, data, var, options).map{|l| str << l}
+ concept_codes(codes, data, var, options).map{|c| str << c}
+ observations(measures, dimensions, codes, data, observation_labels, var, options).map{|o| str << o}
+ str
end
# Returns the block of Turtle @prefix declarations for the dataset named
# `var` (sanitized first). Caller options are merged over `defaults`.
# The visible declarations cover prop:, dct:, xsd:, cs:, code:, owl:, skos:,
# foaf:, org: and prov:; the cs: and code: prefixes embed `var` in their URI.
# This version drops the class: prefix.
# NOTE(review): the lines that open the heredoc and assign `base` are elided
# by the diff hunk below; `base` presumably comes from options[:base_url] -
# TODO confirm.
def prefixes(var, options={})
var = sanitize([var]).first
options = defaults().merge(options)
@@ -78,28 +122,32 @@
@prefix prop: <#{base}/dc/properties/> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix cs: <#{base}/dc/dataset/#{var}/cs/> .
@prefix code: <#{base}/dc/dataset/#{var}/code/> .
- @prefix class: <#{base}/dc/dataset/#{var}/class/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix org: <http://www.w3.org/ns/org#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
EOF
end
# Returns the Turtle statement declaring `ns:dsd-<var>` as a
# qb:DataStructureDefinition. The new signature takes measures, dimensions
# and codes separately (the old one took a single `components` list) and
# converts them with generate_resources. Each dimension and then each measure
# is attached inline as a blank-node component:
#   qb:component [ qb:dimension <term> ] ;
#   qb:component [ qb:measure <term> ] ;
# Every appended line ends in ";\n", so `str[-2]='.'` overwrites the final
# ';' with the '.' that terminates the statement, and one more newline is
# appended after it. The converted codes (rdf_codes) are not used here.
- def data_structure_definition(components,var,options={})
+ def data_structure_definition(measures,dimensions,codes,var,options={})
var = sanitize([var]).first
options = defaults().merge(options)
+ rdf_measures, rdf_dimensions, rdf_codes = generate_resources(measures, dimensions, codes, options)
+
str = "ns:dsd-#{var} a qb:DataStructureDefinition;\n"
- str << " qb:component\n"
- components.map{|n|
- str << " cs:#{n} ,\n"
+ rdf_dimensions.map{|d|
+ str << " qb:component [ qb:dimension #{d} ] ;\n"
}
+
+ rdf_measures.map{|m|
+ str << " qb:component [ qb:measure #{m} ] ;\n"
+ }
str[-2]='.'
str<<"\n"
str
end
@@ -139,104 +187,142 @@
specs
end
# Returns an array with one Turtle snippet per dimension, each declaring the
# dimension term as `rdf:Property, qb:DimensionProperty` with an rdfs:label.
# Terms and code triples come from generate_resources([], dimensions, codes).
# `dimension_codes` holds the first element of every code triple, with the
# surrounding angle brackets removed when it starts with "<http:".
# When the raw dimension name (dimensions[i]) is found in that list, the
# snippet also carries qb:codeList (second element of the matching triple)
# and rdfs:range (third element); otherwise only the label is written.
# The label is strip_prefixes(strip_uri(term)); both helpers are defined
# outside the code shown here. `var` is accepted but not read.
def dimension_properties(dimensions, codes, var, options={})
options = defaults().merge(options)
+ rdf_measures, rdf_dimensions, rdf_codes = generate_resources([], dimensions, codes, options)
props = []
-
- dimensions.map{|d|
- if codes.include?(d)
- props << <<-EOF.unindent
- prop:#{d} a rdf:Property, qb:DimensionProperty ;
- rdfs:label "#{d}"@en ;
- qb:codeList code:#{d.downcase} ;
- rdfs:range code:#{d.downcase.capitalize} .
- EOF
- else
- props << <<-EOF.unindent
- prop:#{d} a rdf:Property, qb:DimensionProperty ;
- rdfs:label "#{d}"@en .
+ dimension_codes = rdf_codes.map{|c|
+ if c[0]=~/^<http:/
+ c[0][1..-2]
+ else
+ c[0]
+ end
+ }
- EOF
- end
+ rdf_dimensions.each_with_index{|d,i|
+ if dimension_codes.include?(dimensions[i])
+ code = rdf_codes[dimension_codes.index(dimensions[i])]
+ props << <<-EOF.unindent
+ #{d} a rdf:Property, qb:DimensionProperty ;
+ rdfs:label "#{strip_prefixes(strip_uri(d))}"@en ;
+ qb:codeList #{code[1]} ;
+ rdfs:range #{code[2]} .
+
+ EOF
+ else
+ props << <<-EOF.unindent
+ #{d} a rdf:Property, qb:DimensionProperty ;
+ rdfs:label "#{strip_prefixes(strip_uri(d))}"@en .
+
+ EOF
+ end
}
props
end
# Returns an array with one Turtle snippet per measure, each declaring the
# measure term as `rdf:Property, qb:MeasureProperty` with an rdfs:label.
# Terms are taken from the first element of
# generate_resources(measures, [], [], options); the label is
# strip_prefixes(strip_uri(term)), using helpers defined outside the code
# shown here. `var` is accepted but not read.
def measure_properties(measures, var, options={})
options = defaults().merge(options)
+ rdf_measures = generate_resources(measures, [], [], options)[0]
props = []
- measures.map{ |m|
+ rdf_measures.map{ |m|
props << <<-EOF.unindent
- prop:#{m} a rdf:Property, qb:MeasureProperty ;
- rdfs:label "#{m}"@en .
+ #{m} a rdf:Property, qb:MeasureProperty ;
+ rdfs:label "#{strip_prefixes(strip_uri(m))}"@en .
EOF
}
props
end
# Returns an array of Turtle snippets, one qb:Observation per entry of
# `observation_labels`. `data` is indexed as data[name][i], where i is the
# position of the label, and is passed through encode_data first.
#
# Each snippet names the observation `ns:obs<label>`, links it to
# `ns:dataset-<var>`, adds an rdfs:label unless options[:no_labels] is set,
# then writes one line per dimension and per measure using the terms from
# generate_resources. A dimension whose raw name appears among the code names
# (`dimension_codes`) is written with to_resource; every other dimension and
# every measure is written with to_literal.
#
# An observation with a nil value is left out unless options[:encode_nulls]
# is set. When it is left out, options[:raise_nils] raises with the snippet
# in the message, options[:whiny_nils] prints the same message with puts,
# and otherwise it is dropped silently.
#
# NOTE(review): to_resource and to_literal appear only as removed lines in
# this file; presumably they are now supplied by R2RDF::Parser - TODO confirm.
# NOTE(review): encode_data has already turned every value of a coded column
# into a String, so a nil in such a column may no longer be caught by the
# `== nil` check below - confirm.
def observations(measures, dimensions, codes, data, observation_labels, var, options={})
var = sanitize([var]).first
options = defaults().merge(options)
+ rdf_measures, rdf_dimensions, rdf_codes = generate_resources(measures, dimensions, codes, options)
+ data = encode_data(codes, data, var, options)
obs = []
+
+ dimension_codes = rdf_codes.map{|c|
+ if c[0]=~/^<http:/
+ c[0][1..-2]
+ else
+ c[0]
+ end
+ }
+
observation_labels.each_with_index.map{|r, i|
contains_nulls = false
str = <<-EOF.unindent
ns:obs#{r} a qb:Observation ;
qb:dataSet ns:dataset-#{var} ;
EOF
str << " rdfs:label \"#{r}\" ;\n" unless options[:no_labels]
- dimensions.map{|d|
+ dimensions.each_with_index{|d,j|
contains_nulls = contains_nulls | (data[d][i] == nil)
- if codes.include? d
- str << " prop:#{d} <code/#{d.downcase}/#{data[d][i]}> ;\n"
+
+ if dimension_codes.include? d
+ # str << " #{rdf_dimensions[j]} <code/#{d.downcase}/#{data[d][i]}> ;\n"
+ str << " #{rdf_dimensions[j]} #{to_resource(data[d][i], options)} ;\n"
else
- str << " prop:#{d} ns:#{to_resource(data[d][i], options)} ;\n"
+ str << " #{rdf_dimensions[j]} #{to_literal(data[d][i], options)} ;\n"
end
}
- measures.map{|m|
+ measures.each_with_index{|m,j|
contains_nulls = contains_nulls | (data[m][i] == nil)
- str << " prop:#{m} #{to_literal(data[m][i], options)} ;\n"
+ str << " #{rdf_measures[j]} #{to_literal(data[m][i], options)} ;\n"
}
str << " .\n\n"
- obs << str unless contains_nulls && !options[:encode_nulls]
-
+ if contains_nulls && !options[:encode_nulls]
+ if options[:raise_nils]
+ raise "missing component for observation, skipping: #{str}, "
+ elsif options[:whiny_nils]
+ puts "missing component for observation, skipping: #{str}, "
+ end
+ else
+ obs << str
+ end
}
obs
end
# Returns an array with one Turtle snippet per code. Code triples come from
# generate_resources([], [], codes) and `data` is passed through encode_data.
# For each triple, `refcode` is its first element with surrounding angle
# brackets removed; it is the key used to read the column from `data`.
# The snippet declares:
#   * the third element as an rdfs:Class / owl:Class that is a subclass of
#     skos:Concept, with rdfs:seeAlso pointing at the second element;
#   * the second element as a skos:ConceptScheme with labels, a notation of
#     "CL_<NAME>" and a note;
#   * one skos:hasTopConcept line per unique value in the column, written
#     with to_resource; nil values are skipped unless options[:encode_nulls].
# Human-readable names are strip_prefixes(strip_uri(second element)); both
# helpers are defined outside the code shown here.
# NOTE(review): the diff hunk boundary below hides a few lines between
# `lists << str` and the final `lists`; presumably only the closing of the
# block - TODO confirm.
# NOTE(review): values were already converted to Strings by encode_data, so
# the `value == nil` test may never match for these columns - confirm.
def code_lists(codes, data, var, options={})
options = defaults().merge(options)
+ rdf_measures, rdf_dimensions, rdf_codes = generate_resources([], [], codes, options)
+ data = encode_data(codes, data, var, options)
lists = []
- codes.map{|code|
+ rdf_codes.map{|code|
+ if code[0] =~ /^<.+>$/
+ refcode = code[0][1..-2]
+ else
+ refcode = code[0]
+ end
str = <<-EOF.unindent
- code:#{code.downcase.capitalize} a rdfs:Class, owl:Class;
+ #{code[2]} a rdfs:Class, owl:Class;
rdfs:subClassOf skos:Concept ;
- rdfs:label "Code list for #{code} - codelist class"@en;
- rdfs:comment "Specifies the #{code} for each observation";
- rdfs:seeAlso code:#{code.downcase} .
+ rdfs:label "Code list for #{strip_prefixes(strip_uri(code[1]))} - codelist class"@en;
+ rdfs:comment "Specifies the #{strip_prefixes(strip_uri(code[1]))} for each observation";
+ rdfs:seeAlso #{code[1]} .
- code:#{code.downcase} a skos:ConceptScheme;
- skos:prefLabel "Code list for #{code} - codelist scheme"@en;
- rdfs:label "Code list for #{code} - codelist scheme"@en;
- skos:notation "CL_#{code.upcase}";
- skos:note "Specifies the #{code} for each observation";
+ #{code[1]} a skos:ConceptScheme;
+ skos:prefLabel "Code list for #{strip_prefixes(strip_uri(code[1]))} - codelist scheme"@en;
+ rdfs:label "Code list for #{strip_prefixes(strip_uri(code[1]))} - codelist scheme"@en;
+ skos:notation "CL_#{strip_prefixes(strip_uri(code[1])).upcase}";
+ skos:note "Specifies the #{strip_prefixes(strip_uri(code[1]))} for each observation";
EOF
- data[code].uniq.map{|value|
+ data[refcode].uniq.map{|value|
unless value == nil && !options[:encode_nulls]
- str << " skos:hasTopConcept <code/#{code.downcase}/#{to_resource(value,options)}> ;\n"
+ str << " skos:hasTopConcept #{to_resource(value,options)} ;\n"
end
}
str << " .\n\n"
lists << str
@@ -246,63 +332,38 @@
lists
end
# Returns an array of Turtle snippets, one skos:Concept per unique value of
# each coded column. Code triples come from generate_resources([], [], codes)
# and `data` is passed through encode_data. For each triple, `refcode` (its
# first element without surrounding angle brackets) selects the column.
# Each concept is typed as skos:Concept plus the code's class (third
# element) and is linked to the code's scheme (second element) through
# skos:topConceptOf and skos:inScheme. nil values are skipped unless
# options[:encode_nulls] is set.
# NOTE(review): `i` is the position within the uniq'd list, but the
# skos:prefLabel reads data[refcode][i] from the full column; when the
# column contains duplicates the label may belong to a different value than
# `value` - confirm.
def concept_codes(codes, data, var, options={})
options = defaults().merge(options)
+ rdf_measures, rdf_dimensions, rdf_codes = generate_resources([], [], codes, options)
concepts = []
- codes.map{|code|
- data[code].uniq.map{|value|
+ data = encode_data(codes, data, var, options)
+ rdf_codes.map{|code|
+ if code[0] =~ /^<.+>$/
+ refcode = code[0][1..-2]
+ else
+ refcode = code[0]
+ end
+ data[refcode].uniq.each_with_index{|value,i|
unless value == nil && !options[:encode_nulls]
concepts << <<-EOF.unindent
- <code/#{code.downcase}/#{to_resource(value,options)}> a skos:Concept, code:#{code.downcase.capitalize};
- skos:topConceptOf code:#{code.downcase} ;
- skos:prefLabel "#{to_resource(value,options)}" ;
- skos:inScheme code:#{code.downcase} .
+ #{to_resource(value,options)} a skos:Concept, #{code[2]};
+ skos:topConceptOf #{code[1]} ;
+ skos:prefLabel "#{strip_uri(data[refcode][i])}" ;
+ skos:inScheme #{code[1]} .
EOF
end
}
}
concepts
end
- def to_resource(obj, options)
- if obj.is_a? String
- #TODO decide the right way to handle missing values, since RDF has no null
- #probably throw an error here since a missing resource is a bigger problem
- obj = "NA" if obj.empty?
-
- #TODO remove special characters (faster) as well (eg '?')
- obj.gsub(' ','_').gsub('?','')
- elsif obj == nil && options[:encode_nulls]
- '"NA"'
- elsif obj.is_a? Numeric
- #resources cannot be referred to purely by integer (?)
- "n"+obj.to_s
- else
- obj
- end
- end
-
- def to_literal(obj, options)
- if obj.is_a? String
- # Depressing that there's no more elegant way to check if a string is
- # a number...
- if val = Integer(obj) rescue nil
- val
- elsif val = Float(obj) rescue nil
- val
- else
- '"'+obj+'"'
- end
- elsif obj == nil && options[:encode_nulls]
- #TODO decide the right way to handle missing values, since RDF has no null
- '"NA"'
- else
- obj
- end
# Rewrites full URIs under http://www.rqtl.org in a Turtle string back into
# short forms, via three chained gsub calls:
#   <http://www.rqtl.org/dc/properties/X>        -> prop:X
#   <http://www.rqtl.org/ns/dc/code/A/B>         -> <code/A/B>
#   <http://www.rqtl.org/dc/dataset/V/code/X>    -> code:X
# Returns a new String; the host is hard-coded rather than read from options.
# NOTE(review): the dots in "www.rqtl.org" are escaped only in the first
# pattern; in the other two they match any character - confirm.
# NOTE(review): the second pattern expects a "/ns/dc/code/" path, which does
# not match any @prefix declared in the code shown here - confirm.
+ def abbreviate_known(turtle_string)
+ #debug method
+ turtle_string.gsub(/<http:\/\/www\.rqtl\.org\/dc\/properties\/(\S+)>/, 'prop:\1').gsub(/<http:\/\/www.rqtl.org\/ns\/dc\/code\/(\S+)\/(\S+)>/, '<code/\1/\2>').gsub(/<http:\/\/www.rqtl.org\/dc\/dataset\/(\S+)\/code\/(\S+)>/, 'code:\2')
end
end
end
end