lib/activefacts/generator/etl/unidex.rb in activefacts-compositions-1.9.19 vs lib/activefacts/generator/etl/unidex.rb in activefacts-compositions-1.9.20
- old
+ new
@@ -22,13 +22,11 @@
sql_trait = ActiveFacts::Generators::Traits::SQL
Class.new.extend(sql_trait). # Anonymous class to enable access to traits module instance methods
options.
merge(
{
- dialect: [String, "SQL Dialect to use"],
- value_width: [Integer, "Number of characters to index from long values"],
- phonetic_confidence: [Integer, "Percentage confidence for a phonetic match"],
+ dialect: [String, "SQL Dialect to use"]
}
)
end
def initialize composition, options = {}
@@ -47,13 +45,10 @@
process_options options
end
def process_options options
- @value_width = (options.delete('value_width') || 32).to_i
- @phonetic_confidence = (options.delete('phonetic_confidence') || 40).to_i
-
super
end
def generate
@all_table_unions = []
@@ -137,11 +132,11 @@
''
end
def generate_indicator leaf
nil # REVISIT: Do we need anything here?
- # select leaf.root, safe_column_name(leaf), 1, column_name(leaf), 1
+ # select(leaf.root, safe_column_name(leaf), 1, column_name(leaf))
end
# This foreign key connects two composites (tables)
def generate_joined_value member
foreign_key = member.foreign_key
@@ -229,61 +224,50 @@
def search_expr composite, intrinsic_type, col_expr, search_methods, source_field
case intrinsic_type
when MM::DataType::TYPE_Char,
MM::DataType::TYPE_String,
MM::DataType::TYPE_Text
- # Produce a truncated value with the requested search
+ # Produce a select yielding values for the requested search type
search_methods.flat_map do |sm|
case sm
when 'none' # Do not index this value
nil
when 'simple' # Disregard white-space only
- select(composite, truncate(col_expr, @value_width), 'simple', source_field, 1.0)
+ select(composite, col_expr, 'simple', source_field)
- when 'alpha', # Strip white space and punctuation, just use alphabetic characters
- 'typo' # Use trigram similarity to detect typographic errors, over the same values
- truncated = truncate(as_alpha(col_expr), @value_width)
- select(
- composite, truncated, sm, source_field,
- "CASE WHEN #{truncated} = #{col_expr} THEN 1.0 ELSE 0.95 END" # Maybe exact match.
- )
+ when 'alpha' # Strip white space and punctuation, just use alphabetic characters
+ select(composite, as_alpha(col_expr), sm, source_field)
- when 'phonetic' # Use phonetic matching as well as trigrams
- search_expr(composite, intrinsic_type, col_expr, ['typo'], source_field) <<
- select(composite, phonetics(col_expr), 'phonetic', source_field, @phonetic_confidence/100.0, true)
+ when 'phonetic' # Use phonetic matching as well as trigrams and alpha
+ select(composite, as_alpha(col_expr), 'phonetic', source_field, phonetics(col_expr))
when 'words' # Break the text into words and match each word like alpha
- truncated = truncate(unnest(as_words(col_expr)), @value_width)
- select(composite, truncated, sm, source_field, 0.90, true)
+ select(composite, unnest(as_words(col_expr)), sm, source_field)
when 'names' # Break the text into words and match each word like phonetic
- truncated = truncate(unnest(as_words(col_expr, "''-")), @value_width) # N.B. ' is doubled for SQL
- search_expr(composite, intrinsic_type, col_expr, ['words'], source_field) <<
- phonetics(truncated).map do |phonetic|
- select(composite, phonetic, 'names', source_field, @phonetic_confidence/100.0, true)
- end
+ value = unnest(as_words(col_expr, "''-")) # N.B. ' is doubled for SQL
+ phonetic_select(value, select(composite, value, 'names', source_field))
when 'text' # Index a large text field using significant words and phrases
nil # REVISIT: Implement this type
when 'number' # Cast to number and back to text to canonicalise the value;
- # If the number doesn't match this regexp, we don't index it.
- # This doesn't handle all valid Postgres numeric literals (e.g. 2.3e-4)
- select(composite, col_expr, 'number', source_field, number_or_null(col_expr))
+ # If it doesn't look like a number, we don't index it.
+ value = number_or_null(col_expr)
+ select(composite, value, 'number', source_field, nil, ["#{value} IS NOT NULL"])
when 'phone' # Phone numbers; split, strip each to digits, take the last 8 of each
- select(composite, phone_numbers(col_expr), 'phone', source_field, 1)
+ select(composite, phone_numbers(col_expr), 'phone', source_field)
when 'email' # Use a regexp to find email addresses in this field
- select(composite, truncate(email_addresses(col_expr), @value_width), 'email', source_field, 1)
+ select(composite, email_addresses(col_expr), 'email', source_field)
- when 'date' # Convert string to standard date format if it looks like a date, NULL otherwise
- select(
- composite, col_expr, 'date', source_field, 1,
- %Q{CASE WHEN #{col_expr} ~ '^ *[0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN (#{col_expr}::numeric):text ELSE NULL END}
- )
+ when 'date' # REVISIT: Convert string to standard date format
+ # If it doesn't look like a date, we don't index it.
+ value = date_or_null(col_expr)
+ select(composite, value, 'date', source_field, nil, ["#{value} IS NOT NULL"])
else
$stderrs.puts "Unknown search method #{sm}"
end
end
@@ -293,28 +277,26 @@
when MM::DataType::TYPE_Integer,
MM::DataType::TYPE_Real,
MM::DataType::TYPE_Decimal,
MM::DataType::TYPE_Money
- # Produce a right-justified value
- # REVISIT: This is a dumb thing to do.
- select(composite, lexical_decimal(col_expr, @value_width, value_type.scale), 'simple', source_field, 1)
+ select(composite, col_expr, 'simple', source_field)
when MM::DataType::TYPE_Date
# Produce an ISO representation that sorts lexically (YYYY-MM-DD)
# REVISIT: Support search methods here
- select(composite, lexical_date(col_expr), 'simple', source_field, 1)
+ select(composite, lexical_date(col_expr), 'date', source_field)
when MM::DataType::TYPE_DateTime,
MM::DataType::TYPE_Timestamp
# Produce an ISO representation that sorts lexically (YYYY-MM-DD HH:mm:ss)
# REVISIT: Support search methods here
- select(composite, lexical_datetime(col_expr), 'simple', source_field, 1)
+ select(composite, lexical_datetime(col_expr), 'datetime', source_field)
when MM::DataType::TYPE_Time
# Produce an ISO representation that sorts lexically (YYYY-MM-DD HH:mm:ss)
- select(composite, lexical_time(col_expr), 'simple', source_field, 1)
+ select(composite, lexical_time(col_expr), 'time', source_field)
when MM::DataType::TYPE_Binary
nil # No indexing applied
when nil # Data Type is unknown
else
@@ -323,38 +305,66 @@
def stylise_column_name name
name.words.send(@column_case)*@column_joiner
end
- def select composite, expression, processing, source_field, confidence = 1, distinct = false, where = []
+ def field_names
+ @field_names ||=
+ %w{Value Phonetic Processing SourceTable SourceField LoadBatchID RecordGUID}.
+ map{|n| stylise_column_name(n)}
+ end
+
+ def phonetic_select expression, select
+ field_list =
+ field_names.
+ map do |n|
+ if n =~ /Phonetic/i
+ phonetics(Expression.new(stylise_column_name('Value'), MM::DataType::TYPE_String, true)).to_s + " AS #{n}"
+ else
+ n
+ end
+ end.
+ join(",\n\t")
+
+ %Q{
+ SELECT DISTINCT
+ <FIELDS>
+ FROM (<SUB>
+ ) AS s}.
+ unindent.
+ sub(/<FIELDS>/, field_list).
+ sub(/<SUB>/, select.gsub(/\n/,"\n\t"))
+ end
+
+ def select composite, expression, processing, source_field, phonetic = nil, conditions = []
# These fields are in order of index precedence, to co-locate
# comparable values regardless of source record type or column
- where << 'Value IS NOT NULL' if expression.to_s =~ /\bNULL\b/
- processing_name = stylise_column_name("Processing")
- value_name = stylise_column_name("Value")
- load_batch_id_name = stylise_column_name("LoadBatchID")
- record_guid_name = stylise_column_name("RecordGUID")
- confidence_name = stylise_column_name("Confidence")
- source_table_name = stylise_column_name("SourceTable")
- source_field_name = stylise_column_name("SourceField")
- expression_text = expression.to_s
+
+ select_list =
+ [ expression.to_s,
+ phonetic ? phonetic.to_s : 'NULL',
+ "'"+processing+"'::text",
+ "'"+safe_table_name(composite)+"'::text",
+ "'"+source_field+"'::text",
+ nil,
+ nil,
+ ].zip(field_names).
+ map(&:compact).
+ map{|a| a * ' AS '}.
+ join(%q{,
+ })
+ where =
+ if conditions.empty?
+ ''
+ else
+ "\nWHERE\t#{conditions*"\n AND\t"}"
+ end
select = %Q{
- SELECT#{distinct ? ' DISTINCT' : ''}
- '#{processing}' AS #{processing_name},
- #{expression_text} AS #{value_name},
- #{load_batch_id_name},
- #{confidence} AS #{confidence_name},
- #{record_guid_name},
- '#{safe_table_name(composite)}' AS #{source_table_name},
- '#{source_field}' AS #{source_field_name}
+ SELECT DISTINCT
+ #{select_list}
FROM #{safe_table_name(composite)}}.
- unindent
-
- if where.empty?
- select
- else
- "\nSELECT * FROM (#{select}\n) AS s WHERE #{where*' AND '}"
- end
+ unindent+
+ where
end
end
end