lib/activefacts/generator/etl/unidex.rb in activefacts-compositions-1.9.18 vs lib/activefacts/generator/etl/unidex.rb in activefacts-compositions-1.9.19
- old
+ new
@@ -48,11 +48,11 @@
process_options options
end
def process_options options
@value_width = (options.delete('value_width') || 32).to_i
- @phonetic_confidence = (options.delete('phonetic_confidence') || 70).to_i
+ @phonetic_confidence = (options.delete('phonetic_confidence') || 40).to_i
super
end
def generate
@@ -234,36 +234,59 @@
# Produce a truncated value with the requested search
search_methods.flat_map do |sm|
case sm
when 'none' # Do not index this value
nil
+
when 'simple' # Disregard white-space only
select(composite, truncate(col_expr, @value_width), 'simple', source_field, 1.0)
- when 'alpha' # Strip white space and punctuation, just use alphabetic characters
- select(composite, truncate(as_alpha(col_expr), @value_width), 'alpha', source_field, 0.9)
+ when 'alpha', # Strip white space and punctuation, just use alphabetic characters
+ 'typo' # Use trigram similarity to detect typographic errors, over the same values
+ truncated = truncate(as_alpha(col_expr), @value_width)
+ select(
+ composite, truncated, sm, source_field,
+ "CASE WHEN #{truncated} = #{col_expr} THEN 1.0 ELSE 0.95 END" # Maybe exact match.
+ )
+ when 'phonetic' # Use phonetic matching as well as trigrams
+ search_expr(composite, intrinsic_type, col_expr, ['typo'], source_field) <<
+ select(composite, phonetics(col_expr), 'phonetic', source_field, @phonetic_confidence/100.0, true)
+
when 'words' # Break the text into words and match each word like alpha
- nil # REVISIT: Implement this type
+ truncated = truncate(unnest(as_words(col_expr)), @value_width)
+ select(composite, truncated, sm, source_field, 0.90, true)
- # when 'phrases' # Words, but where adjacent sequences of words matter
- when 'typo' # Use trigram similarity to detect typographic errors
- # REVISIT: Implement this type properly
- select(composite, trigram(as_alpha(col_expr)), 'typo', source_field, 0.9)
-
- when 'phonetic' # Use phonetic matching as well as trigrams
- phonetics(col_expr).map do |p|
- select(composite, p, 'phonetic', source_field, @phonetic_confidence/100.0)
+ when 'names' # Break the text into words and match each word like phonetic
+ truncated = truncate(unnest(as_words(col_expr, "''-")), @value_width) # N.B. ' is doubled for SQL
+ search_expr(composite, intrinsic_type, col_expr, ['words'], source_field) <<
+ phonetics(truncated).map do |phonetic|
+ select(composite, phonetic, 'names', source_field, @phonetic_confidence/100.0, true)
end
- when 'names' # Break the text into words and match each word like phonetic
- nil # REVISIT: Implement this type
-
when 'text' # Index a large text field using significant words and phrases
nil # REVISIT: Implement this type
+
+ when 'number' # Cast to number and back to text to canonicalise the value;
+ # If the number doesn't match this regexp, we don't index it.
+ # This doesn't handle all valid Postgres numeric literals (e.g. 2.3e-4)
+ select(composite, col_expr, 'number', source_field, number_or_null(col_expr))
+
+ when 'phone' # Phone numbers; split, strip each to digits, take the last 8 of each
+ select(composite, phone_numbers(col_expr), 'phone', source_field, 1)
+
+ when 'email' # Use a regexp to find email addresses in this field
+ select(composite, truncate(email_addresses(col_expr), @value_width), 'email', source_field, 1)
+
+ when 'date' # Convert string to standard date format if it looks like a date, NULL otherwise
+ select(
+ composite, col_expr, 'date', source_field, 1,
+ %Q{CASE WHEN #{col_expr} ~ '^ *[0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN (#{col_expr}::numeric):text ELSE NULL END}
+ )
+
else
- raise "Unknown search method #{sm}"
+ $stderrs.puts "Unknown search method #{sm}"
end
end
when MM::DataType::TYPE_Boolean
nil # REVISIT: Implement this type
@@ -271,10 +294,11 @@
when MM::DataType::TYPE_Integer,
MM::DataType::TYPE_Real,
MM::DataType::TYPE_Decimal,
MM::DataType::TYPE_Money
# Produce a right-justified value
+ # REVISIT: This is a dumb thing to do.
select(composite, lexical_decimal(col_expr, @value_width, value_type.scale), 'simple', source_field, 1)
when MM::DataType::TYPE_Date
# Produce an ISO representation that sorts lexically (YYYY-MM-DD)
# REVISIT: Support search methods here
@@ -299,11 +323,11 @@
def stylise_column_name name
name.words.send(@column_case)*@column_joiner
end
- def select composite, expression, processing, source_field, confidence = 1, where = []
+ def select composite, expression, processing, source_field, confidence = 1, distinct = false, where = []
# These fields are in order of index precedence, to co-locate
# comparable values regardless of source record type or column
where << 'Value IS NOT NULL' if expression.to_s =~ /\bNULL\b/
processing_name = stylise_column_name("Processing")
value_name = stylise_column_name("Value")
@@ -311,20 +335,19 @@
record_guid_name = stylise_column_name("RecordGUID")
confidence_name = stylise_column_name("Confidence")
source_table_name = stylise_column_name("SourceTable")
source_field_name = stylise_column_name("SourceField")
expression_text = expression.to_s
- expression_text = "ARRAY[#{expression_text}]" unless expression.is_array
select = %Q{
- SELECT '#{processing}' AS #{processing_name},
+ SELECT#{distinct ? ' DISTINCT' : ''}
+ '#{processing}' AS #{processing_name},
#{expression_text} AS #{value_name},
#{load_batch_id_name},
- #{"%.2f" % confidence} AS #{confidence_name},
+ #{confidence} AS #{confidence_name},
#{record_guid_name},
'#{safe_table_name(composite)}' AS #{source_table_name},
'#{source_field}' AS #{source_field_name}
- FROM #{safe_table_name(composite)}
- WHERE COALESCE(#{expression},'') <> ''}.
+ FROM #{safe_table_name(composite)}}.
unindent
if where.empty?
select
else