lib/activefacts/generator/etl/unidex.rb in activefacts-compositions-1.9.18 vs lib/activefacts/generator/etl/unidex.rb in activefacts-compositions-1.9.19

- old
+ new

@@ -48,11 +48,11 @@ process_options options end def process_options options @value_width = (options.delete('value_width') || 32).to_i - @phonetic_confidence = (options.delete('phonetic_confidence') || 70).to_i + @phonetic_confidence = (options.delete('phonetic_confidence') || 40).to_i super end def generate @@ -234,36 +234,59 @@ # Produce a truncated value with the requested search search_methods.flat_map do |sm| case sm when 'none' # Do not index this value nil + when 'simple' # Disregard white-space only select(composite, truncate(col_expr, @value_width), 'simple', source_field, 1.0) - when 'alpha' # Strip white space and punctuation, just use alphabetic characters - select(composite, truncate(as_alpha(col_expr), @value_width), 'alpha', source_field, 0.9) + when 'alpha', # Strip white space and punctuation, just use alphabetic characters + 'typo' # Use trigram similarity to detect typographic errors, over the same values + truncated = truncate(as_alpha(col_expr), @value_width) + select( + composite, truncated, sm, source_field, + "CASE WHEN #{truncated} = #{col_expr} THEN 1.0 ELSE 0.95 END" # Maybe exact match. + ) + when 'phonetic' # Use phonetic matching as well as trigrams + search_expr(composite, intrinsic_type, col_expr, ['typo'], source_field) << + select(composite, phonetics(col_expr), 'phonetic', source_field, @phonetic_confidence/100.0, true) + when 'words' # Break the text into words and match each word like alpha - nil # REVISIT: Implement this type + truncated = truncate(unnest(as_words(col_expr)), @value_width) + select(composite, truncated, sm, source_field, 0.90, true) - # when 'phrases' # Words, but where adjacent sequences of words matter - when 'typo' # Use trigram similarity to detect typographic errors - # REVISIT: Implement this type properly - select(composite, trigram(as_alpha(col_expr)), 'typo', source_field, 0.9) - - when 'phonetic' # Use phonetic matching as well as trigrams - phonetics(col_expr).map do |p| - select(composite, p, 'phonetic', source_field, @phonetic_confidence/100.0) + when 'names' # Break the text into words and match each word like phonetic + truncated = truncate(unnest(as_words(col_expr, "''-")), @value_width) # N.B. ' is doubled for SQL + search_expr(composite, intrinsic_type, col_expr, ['words'], source_field) << + phonetics(truncated).map do |phonetic| + select(composite, phonetic, 'names', source_field, @phonetic_confidence/100.0, true) end - when 'names' # Break the text into words and match each word like phonetic - nil # REVISIT: Implement this type - when 'text' # Index a large text field using significant words and phrases nil # REVISIT: Implement this type + + when 'number' # Cast to number and back to text to canonicalise the value; + # If the number doesn't match this regexp, we don't index it. + # This doesn't handle all valid Postgres numeric literals (e.g. 2.3e-4) + select(composite, col_expr, 'number', source_field, number_or_null(col_expr)) + + when 'phone' # Phone numbers; split, strip each to digits, take the last 8 of each + select(composite, phone_numbers(col_expr), 'phone', source_field, 1) + + when 'email' # Use a regexp to find email addresses in this field + select(composite, truncate(email_addresses(col_expr), @value_width), 'email', source_field, 1) + + when 'date' # Convert string to standard date format if it looks like a date, NULL otherwise + select( + composite, col_expr, 'date', source_field, 1, + %Q{CASE WHEN #{col_expr} ~ '^ *[0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN (#{col_expr}::numeric):text ELSE NULL END} + ) + else - raise "Unknown search method #{sm}" + $stderrs.puts "Unknown search method #{sm}" end end when MM::DataType::TYPE_Boolean nil # REVISIT: Implement this type @@ -271,10 +294,11 @@ when MM::DataType::TYPE_Integer, MM::DataType::TYPE_Real, MM::DataType::TYPE_Decimal, MM::DataType::TYPE_Money # Produce a right-justified value + # REVISIT: This is a dumb thing to do. select(composite, lexical_decimal(col_expr, @value_width, value_type.scale), 'simple', source_field, 1) when MM::DataType::TYPE_Date # Produce an ISO representation that sorts lexically (YYYY-MM-DD) # REVISIT: Support search methods here @@ -299,11 +323,11 @@ def stylise_column_name name name.words.send(@column_case)*@column_joiner end - def select composite, expression, processing, source_field, confidence = 1, where = [] + def select composite, expression, processing, source_field, confidence = 1, distinct = false, where = [] # These fields are in order of index precedence, to co-locate # comparable values regardless of source record type or column where << 'Value IS NOT NULL' if expression.to_s =~ /\bNULL\b/ processing_name = stylise_column_name("Processing") value_name = stylise_column_name("Value") @@ -311,20 +335,19 @@ record_guid_name = stylise_column_name("RecordGUID") confidence_name = stylise_column_name("Confidence") source_table_name = stylise_column_name("SourceTable") source_field_name = stylise_column_name("SourceField") expression_text = expression.to_s - expression_text = "ARRAY[#{expression_text}]" unless expression.is_array select = %Q{ - SELECT '#{processing}' AS #{processing_name}, + SELECT#{distinct ? ' DISTINCT' : ''} + '#{processing}' AS #{processing_name}, #{expression_text} AS #{value_name}, #{load_batch_id_name}, - #{"%.2f" % confidence} AS #{confidence_name}, + #{confidence} AS #{confidence_name}, #{record_guid_name}, '#{safe_table_name(composite)}' AS #{source_table_name}, '#{source_field}' AS #{source_field_name} - FROM #{safe_table_name(composite)} - WHERE COALESCE(#{expression},'') <> ''}. + FROM #{safe_table_name(composite)}}. unindent if where.empty? select else