# Author:: Eyal Oren
# Copyright:: (c) 2005-2006 Eyal Oren
# License:: LGPL

require 'sqlite3'
require 'active_rdf'
require 'federation/connection_pool'
require 'uuidtools'
require 'queryengine/ntriples_parser'
require 'open-uri'
require 'mime/types'

ActiveRdfLogger::log_info "Loading RDFLite adapter", self

# RDFLite is a lightweight RDF database on top of sqlite3. It can act as adapter
# in ActiveRDF. It supports on-disk and in-memory usage, and allows keyword
# search if ferret is installed.
class RDFLite < ActiveRdfAdapter
  # Probe for Ferret in the class body, where self is RDFLite. has_ferret?
  # reads @have_ferret from RDFLite itself, so the flag has to be stored here;
  # assigning it inside `class << self` would put it on the singleton class
  # and has_ferret? would always return nil.
  begin
    require 'ferret'
    @have_ferret = true
  rescue LoadError
    ActiveRdfLogger::log_info "Keyword search is disabled since we could not load Ferret. To enable, please do \"gem install ferret\"", self
    @have_ferret = false
  end

  class << self
    # returns true if the ferret library could be loaded, false otherwise
    def has_ferret?
      @have_ferret
    end
  end

  ConnectionPool.register_adapter(:rdflite,self)
  bool_accessor :keyword_search, :reasoning

  # column names of the triple table, in subject/predicate/object/context order
  SPOC = ['s','p','o','c'].freeze

  # patterns for resources and literals in their stored (ntriples) form
  Resource = /<([^>]*)>/
  Literal = /"((?:\\"|[^"])*)"/

  # instantiates RDFLite database
  # available parameters:
  # * :location => filepath (defaults to memory)
  # * :keyword => true/false (defaults to false)
  # * :reasoning => true/false (defaults to false)
  # * :pidx, :oidx, etc. => true/false (enable/disable these indices)
  def initialize(params = {})
    super()
    ActiveRdfLogger::log_info(self) { "Initialised rdflite with params #{params.to_s}" }

    @reads = true
    @writes = true

    # if no file-location given, we use in-memory store
    file = params[:location] || ':memory:'
    @db = SQLite3::Database.new(file)

    # disable keyword search by default, enable only if ferret is found
    @keyword_search = params[:keyword].nil? ? false : params[:keyword]
    @keyword_search &= self.class.has_ferret?

    @reasoning = params[:reasoning] || false

    # cache of computed subproperties; always initialised because
    # subproperties is a public method and may be called without reasoning
    @subprops = {}

    if keyword_search?
      # we initialise the ferret index, either as a file or in memory
      infos = Ferret::Index::FieldInfos.new

      # we setup the fields not to store object's contents
      infos.add_field(:subject, :store => :yes, :index => :no, :term_vector => :no)
      infos.add_field(:object, :store => :no) #, :index => :omit_norms)

      @ferret = if params[:location]
        Ferret::I.new(:path => params[:location] + '.ferret', :field_infos => infos)
      else
        Ferret::I.new(:field_infos => infos)
      end
    end

    # turn off filesystem synchronisation for speed
    @db.synchronous = 'off'

    # create triples table. ignores duplicated triples
    @db.execute('create table if not exists triple(s,p,o,c, unique(s,p,o,c) on conflict ignore)')

    create_indices(params)
    @db
  end

  # returns the number of rows in the triple table
  def size
    @db.execute('select count(*) from triple')[0][0].to_i
  end

  # returns all triples in the datastore, one space-joined "s p o c" string
  # per triple
  def dump
    @db.execute('select s,p,o,c from triple').collect do |s,p,o,c|
      [s,p,o,c].join(' ')
    end
  end

  # deletes all triples from datastore
  def clear
    @db.execute('delete from triple')
  end

  # close adapter and remove it from the ConnectionPool
  def close
    ConnectionPool.remove_data_source(self)
    @db.close
  end

  # deletes triple(s,p,o,c) from datastore
  # symbol parameters match anything: delete(:s,:p,:o) will delete all triples
  # you can specify a context to limit deletion to that context:
  # delete(:s,:p,:o, 'http://context') will delete all triples with that context
  def delete(s, p, o, c=nil)
    # convert non-nil input to internal format (symbols become nil as well)
    quad = [s,p,o,c].collect {|r| r.nil? ? nil : internalise(r) }

    # construct where clause for deletion (for all non-nil input)
    where_clauses = []
    conditions = []
    quad.each_with_index do |r,i|
      unless r.nil?
        conditions << r
        where_clauses << "#{SPOC[i]} = ?"
      end
    end

    # construct delete string
    ds = 'delete from triple'
    ds << " where #{where_clauses.join(' and ')}" unless where_clauses.empty?

    # execute delete string with possible deletion conditions (for each
    # non-empty where clause)
    ActiveRdfLogger::log_debug(self) { "Deleting #{[s,p,o,c].join(' ')}" }
    @db.execute(ds, *conditions)

    # delete literal from ferret index
    if keyword_search?
      @ferret.search_each("subject:\"#{s}\", object:\"#{o}\"") do |idx, score|
        @ferret.delete(idx)
      end
    end

    @db
  end

  # adds triple(s,p,o) to datastore
  # s,p must be resources, o can be primitive data or resource
  # raises ActiveRdfError if s or p does not respond to :uri
  def add(s,p,o,c=nil)
    # check illegal input
    raise(ActiveRdfError, "adding non-resource #{s} while adding (#{s},#{p},#{o},#{c})") unless s.respond_to?(:uri)
    raise(ActiveRdfError, "adding non-resource #{p} while adding (#{s},#{p},#{o},#{c})") unless p.respond_to?(:uri)

    # build a one-statement ntriples document and reuse the bulk insert path
    triple = [s, p, o].collect{|r| serialise(r) }
    add_ntriples(triple.join(' ') + " .\n", serialise(c) )
  end

  # flushes openstanding changes to underlying sqlite3
  def flush
    # since we always write changes into sqlite3 immediately, we don't do
    # anything here
    true
  end

  # loads triples from the given location (file path or URL) into the
  # datastore. rdf/xml input is converted through Redland, everything else is
  # read as ntriples.
  # raises ActiveRdfError if rdf/xml is given but Redland cannot be loaded
  def load(location)
    # the context is the URL itself, or a file: resource for local paths
    context = if URI.parse(location).host
      location
    else
      internalise(RDFS::Resource.new("file:#{location}"))
    end

    case MIME::Types.of(location)
    when MIME::Types['application/rdf+xml']
      begin
        # can only parse rdf/xml with redland
        # die otherwise
        require 'rdf/redland'
        model = Redland::Model.new
        Redland::Parser.new.parse_into_model(model, location)
        # store under the same context that the ntriples branch uses
        add_ntriples(model.to_string('ntriples'), context)
      rescue LoadError
        raise ActiveRdfError, "cannot parse remote rdf/xml file without Redland: please install Redland (librdf.org) and its Ruby bindings"
      end
    else
      # block form closes the handle once the data has been read
      data = open(location) { |io| io.read }
      add_ntriples(data, context)
    end
  end

  # adds ntriples from given context into datastore
  # ntriples is a string in ntriples format; context is stored as-is in the
  # c column of every inserted row
  def add_ntriples(ntriples, context)
    # parse before opening the transaction, so that a parse error cannot
    # leave a transaction open
    triples = NTriplesParser.parse(ntriples)

    # block form commits on success and rolls back if an exception is raised
    @db.transaction do
      insert = @db.prepare('insert into triple values (?,?,?,?);')
      begin
        triples.each do |s,p,o|
          # convert triples into internal db format
          subject, predicate, object = [s,p,o].collect {|r| internalise(r) }

          # insert triple into database
          insert.execute(subject, predicate, object, context)

          # if keyword-search available, insert the object into keyword search
          @ferret << {:subject => subject, :object => object} if keyword_search?
        end
      ensure
        # always release the prepared statement
        insert.close
      end
    end

    @db
  end

  # executes ActiveRDF query on datastore
  # returns [[true/false]] for ASK queries, [[count]] for count queries, and
  # otherwise an array of rows with the results converted to ActiveRDF nodes
  def query(query)
    # construct query clauses
    sql, conditions = translate(query)

    # executing query, passing all where-clause values as parameters (so that
    # sqlite will encode quotes correctly)
    results = @db.execute(sql, *conditions)

    # if ASK query, we check whether we received a positive result count
    if query.ask?
      return [[results[0][0].to_i > 0]]
    elsif query.count?
      return [[results[0][0].to_i]]
    else
      # otherwise we convert results to ActiveRDF nodes and return them
      return wrap(results, query.resource_class)
    end
  end

  # translates ActiveRDF query into internal sqlite query string
  # returns [sql_string, bind_values]
  def translate(query)
    where, conditions = construct_where(query)
    [construct_select(query) + construct_join(query) + where + construct_sort(query) + construct_limit(query), conditions]
  end

  private

  # construct select clause
  def construct_select(query)
    # ASK queries counts the results, and return true if results > 0
    return "select count(*)" if query.ask?

    # add select terms for each selectclause in the query
    # the term names depend on the join conditions, e.g. t0.s or t1.p
    terms = query.select_clauses.collect { |term| variable_name(query, term) }.join(', ')

    # add possible distinct and count functions to select clause
    terms = "distinct #{terms}" if query.distinct?
    terms = "count(#{terms})" if query.count?

    "select " + terms
  end

  # construct limit and offset clauses
  def construct_limit(query)
    # if no limit given, use limit -1 (no limit)
    limit = query.limits.nil? ? -1 : query.limits

    # if no offset given, use offset 0
    offset = query.offsets.nil? ? 0 : query.offsets

    " limit #{limit} offset #{offset}"
  end

  # sort query results on variable clause (optionally)
  def construct_sort(query)
    if not query.sort_clauses.empty?
      sort = query.sort_clauses.collect { |term| variable_name(query, term) }
      " order by (#{sort.join(',')})"
    elsif not query.reverse_sort_clauses.empty?
      sort = query.reverse_sort_clauses.collect { |term| variable_name(query, term) }
      " order by (#{sort.join(',')}) DESC"
    else
      ""
    end
  end

  # construct join clause
  # TODO: joins don't work this way, they have to be linear (in one direction
  # only, and we should only alias tables we didnt alias yet)
  # we should only look for one join clause in each where-clause: when we find
  # one, we skip the rest of the variables in this clause.
  def construct_join(query)
    join_stmt = ''

    # no join necessary if only one where clause given
    return ' from triple as t0 ' if query.where_clauses.size == 1

    where_clauses = query.where_clauses.flatten
    considering = where_clauses.uniq.select{|w| w.is_a?(Symbol)}

    # constructing hash with indices for all terms
    # e.g. {?s => [1,3,5], ?p => [2], ... }
    term_occurrences = Hash.new()
    where_clauses.each_with_index do |term, index|
      ary = (term_occurrences[term] ||= [])
      ary << index
    end

    where_clauses.each_with_index do |term, index|
      # if the term has been joined with his buddy already, we can skip it
      next unless considering.include?(term)

      # we find all (other) occurrences of this term
      indices = term_occurrences[term]

      # if the term doesnt have a join-buddy, we can skip it
      next if indices.size == 1

      # construct t0,t1,... as aliases for term
      # and construct join condition, e.g. t0.s
      termalias = "t#{index / 4}"
      termjoin = "#{termalias}.#{SPOC[index % 4]}"

      join = if join_stmt.include?(termalias)
        ""
      else
        "triple as #{termalias}"
      end

      indices.each do |i|
        # skip the current term itself
        next if i==index

        # construct t0,t1, etc. as aliases for buddy,
        # and construct join condition, e.g. t0.s = t1.p
        buddyalias = "t#{i/4}"
        buddyjoin = "#{buddyalias}.#{SPOC[i%4]}"

        # TODO: fix reuse of same table names as aliases, e.g.
        # "from triple as t1 join triple as t2 on ... join t1 on ..."
        # is not allowed as such by sqlite
        # but on the other hand, restating the aliases gives ambiguity:
        # "from triple as t1 join triple as t2 on ... join triple as t1 ..."
        # is ambiguous
        if join_stmt.include?(buddyalias)
          # padded with spaces so the condition cannot fuse with the
          # neighbouring tokens (e.g. "t0and" or "t1.sand")
          join << " and #{termjoin} = #{buddyjoin} "
        else
          join << " join triple as #{buddyalias} on #{termjoin} = #{buddyjoin} "
        end
      end
      join_stmt << join

      # remove term from 'todo' list of still-considered terms
      considering.delete(term)
    end

    if join_stmt == ''
      return " from triple as t0 "
    else
      return " from #{join_stmt} "
    end
  end

  # construct where clause
  # returns [where_string, bind_values]; where_string is '' if there is
  # nothing to restrict
  # raises ActiveRdfError if a where clause is not an array, or if keyword
  # clauses refer to more than one subject
  def construct_where(query)
    # collecting where clauses, these will be added to the sql string later
    where = []

    # collecting all the right-hand sides of where clauses (e.g. where name =
    # 'abc'), to add to query string later using ?-notation, because then
    # sqlite will automatically encode quoted literals correctly
    right_hand_sides = []

    # convert each where clause to SQL:
    # add where clause for each subclause, except if it's a variable
    query.where_clauses.each_with_index do |clause,level|
      raise ActiveRdfError, "where clause #{clause} is not a triple" unless clause.is_a?(Array)
      clause.each_with_index do |subclause, i|
        # dont add where clause for variables
        next if subclause.is_a?(Symbol) || subclause.nil?

        conditions = compute_where_condition(i, subclause, query.reasoning? && reasoning?)
        if conditions.size == 1
          where << "t#{level}.#{SPOC[i]} = ?"
        else
          # one placeholder per value: the values are bound by sqlite instead
          # of being pasted into the SQL string, so embedded quotes are safe
          placeholders = conditions.collect { '?' }.join(',')
          where << "t#{level}.#{SPOC[i]} in (#{placeholders})"
        end
        # bind values are appended in the same order as their placeholders
        right_hand_sides.concat(conditions)
      end
    end

    # if keyword clause given, convert it using keyword index
    if query.keyword? && keyword_search?
      subjects = []
      select_subject = query.keywords.collect {|subj,key| subj}.uniq
      raise ActiveRdfError, "cannot do keyword search over multiple subjects" if select_subject.size > 1

      keywords = query.keywords.collect {|subj,key| key}
      @ferret.search_each("object:#{keywords}") do |idx,score|
        subjects << @ferret[idx][:subject]
      end
      subjects.uniq! if query.distinct?
      where << "#{variable_name(query,select_subject.first)} in (#{subjects.collect {'?'}.join(',')})"
      right_hand_sides += subjects
    end

    if where.empty?
      ['',[]]
    else
      ["where " + where.join(' and '), right_hand_sides]
    end
  end

  # returns the list of values (in internal format) that the term at position
  # index (0=s, 1=p, 2=o, 3=c) of a where clause may match
  def compute_where_condition(index, subclause, reasoning)
    conditions = [subclause]

    # expand conditions with rdfs rules if reasoning enabled: only predicates
    # (index 1) have a rule, which expands a property to include all its
    # subproperties; there is no rule for subjects, objects or contexts
    if reasoning && index == 1 && subclause.respond_to?(:uri)
      conditions = subproperties(subclause)
    end

    # convert conditions into internal format
    conditions.collect { |c| internalise(c) }
  end

  # returns resource together with all its (transitive) rdfs:subPropertyOf
  # children; results are cached per resource
  def subproperties(resource)
    # compute and store subproperties of this resource
    # or use earlier computed value if available
    unless @subprops[resource]
      subproperty = Namespace.lookup(:rdfs,:subPropertyOf)
      children_query = Query.new.distinct(:sub).where(:sub, subproperty, resource)
      # reasoning is switched off for the lookup query itself
      children_query.reasoning = false
      children = children_query.execute

      if children.empty?
        @subprops[resource] = [resource]
      else
        @subprops[resource] = [resource] + children.collect{|c| subproperties(c)}.flatten.compact
      end
    end
    @subprops[resource]
  end

  # returns sql variable name for a queryterm
  # raises ActiveRdfError if the term appears neither in a where clause nor
  # in a keyword clause
  def variable_name(query,term)
    # look up the first occurence of this term in the where clauses, and compute
    # the level and s/p/o position of it
    index = query.where_clauses.flatten.index(term)

    if index.nil?
      # term does not appear in where clause
      # but maybe it appears in a keyword clause

      # index would not be nil if we had:
      # select(:o).where(knud, knows, :o).where(:o, :keyword, 'eyal')
      #
      # the only possibility that index is nil is if we have:
      # select(:o).where(:o, :keyword, :eyal) (selecting subject)
      # or if we use a select clause that does not appear in any where clause

      # so we check if we find the term in the keyword clauses, otherwise we throw
      # an error
      if query.keywords.keys.include?(term)
        return "t0.s"
      else
        raise ActiveRdfError, "unbound variable :#{term.to_s} in select of #{query.to_sp}"
      end
    end

    termtable = "t#{index / 4}"
    termspo = SPOC[index % 4]
    return "#{termtable}.#{termspo}"
  end

  # wrap resources into ActiveRDF resources, literals into Strings. result_type
  # is the type that should be used for ActiveRDF resources
  def wrap(results, result_type)
    results.collect do |row|
      row.collect { |result| parse(result, result_type) }
    end
  end

  # Return the result as a correct type. result_type is the type that should
  # be used for "resource" elements. When NTriplesParser.parse_node returns
  # nil or false (e.g. for the number returned by a count(*) query), the raw
  # value is returned unchanged.
  def parse(result, result_type = RDFS::Resource)
    NTriplesParser.parse_node(result, result_type) || result
  end

  # creates the lookup indices on the triple table. each index can be switched
  # on or off through params (e.g. :sidx => true, :spidx => false); spidx and
  # poidx are on by default, all others are off by default
  def create_indices(params)
    # [parameter / index name, indexed columns, enabled by default?]
    index_specs = [
      [:sidx,  's',   false],
      [:pidx,  'p',   false],
      [:oidx,  'o',   false],
      [:spidx, 's,p', true],
      [:soidx, 's,o', false],
      [:poidx, 'p,o', true],
      [:opidx, 'o,p', false]
    ]

    # creating lookup indices
    @db.transaction do
      index_specs.each do |name, columns, default|
        # only fall back to the default when the parameter is absent (nil), so
        # that an explicit false really disables a default-on index
        enabled = params[name].nil? ? default : params[name]
        @db.execute("create index if not exists #{name} on triple(#{columns})") if enabled
      end
    end
  end

  # transform triple into internal format <uri> and "literal"
  # nil and symbols (query variables) are mapped to nil
  def internalise(r)
    if r.nil? or r.is_a? Symbol
      nil
    else
      r.to_ntriple
    end
  end

  # transform resource/literal into ntriples format
  def serialise(r)
    # Fixme: r is most likely NOT a resource
    if r.nil?
      nil
    else
      r.to_ntriple
    end
  end

  public :subproperties
end

class String
  # Interprets the escape sequences in this string (e.g. 'test\nbreak') by
  # evaluating it as a double-quoted string literal in a separate thread, and
  # returns the result. Returns the string itself if the evaluation raises.
  #
  # NOTE(review): this calls eval on the string contents and relies on $SAFE
  # to sandbox it. Ruby versions that no longer implement $SAFE levels give no
  # such protection, so this must not be used on untrusted input there --
  # confirm the target Ruby version and consider replacing the eval.
  def double_quote
    Thread.new do
      $SAFE = 12
      begin
        eval('"%s"' % self)
      rescue Exception
        # Exception (not just StandardError) is rescued on purpose: a
        # malformed literal makes eval raise SyntaxError
        self
      end
    end.value
  end
end