# frozen_string_literal: true

# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


require "google/bigtable/v2/data_pb"
require "google/cloud/bigtable/value_range"
require "google/cloud/bigtable/column_range"
require "google/cloud/bigtable/row_filter/simple_filter"
require "google/cloud/bigtable/row_filter/chain_filter"
require "google/cloud/bigtable/row_filter/interleave_filter"
require "google/cloud/bigtable/row_filter/condition_filter"

module Google
  module Cloud
    module Bigtable
      ##
      # # RowFilter
      #
      # Takes a row as input and produces an alternate view of the row based on
      # specified rules. For example, a RowFilter might trim down a row to include
      # just the cells from columns matching a given regular expression, or it might
      # return all the cells of a row but not their values. More complicated filters
      # can be composed out of these components to express requests such as, "within
      # every column of a particular family, give just the two most recent cells
      # that are older than timestamp X."
      #
      # Two broad categories of RowFilters are `true filters` and `transformers`.
      # Two ways to compose simple filters into more complex ones are
      # `chains` and `interleaves`. They work as follows:
      #
      # * True filters alter the input row by excluding some of its cells wholesale
      # from the output row. An example of a true filter is the `value_regex_filter`,
      # which excludes cells whose values don't match the specified pattern. All
      # regex true filters use RE2 syntax (https://github.com/google/re2/wiki/Syntax)
      # in raw byte mode (RE2::Latin1) and are evaluated as full matches. An
      # important point to keep in mind is that `RE2(.)` is equivalent by default to
      # `RE2([^\n])`, meaning that it does not match newlines. When attempting to
      # match an arbitrary byte, you should therefore use the escape sequence `\C`,
      # which should be further escaped as `\\C` in Ruby.
      #
      # * Transformers alter the input row by changing the values of some of its
      # cells in the output, without excluding them completely. Currently, the only
      # supported transformer is the `strip_value_transformer`, which replaces every
      # cell's value with an empty string.
      #
      # * Chains and interleaves are described in more detail in the
      # RowFilter.Chain and RowFilter.Interleave documentation.
      #
      # The total serialized size of a RowFilter message must not
      # exceed 4096 bytes, and RowFilters may not be nested within each other
      # (in chains or interleaves) to a depth of more than 20.
      #
      # ADVANCED USE:
      # Hook for introspection into the RowFilter. Outputs all cells directly to
      # the output of the read rather than to any parent filter. Consider the
      # following example:
      #
      #     Chain(
      #       FamilyRegex("A"),
      #       Interleave(
      #         All(),
      #         Chain(Label("foo"), Sink())
      #       ),
      #       QualifierRegex("B")
      #     )
      #
      #                         A,A,1,w
      #                         A,B,2,x
      #                         B,B,4,z
      #                            |
      #                     FamilyRegex("A")
      #                            |
      #                         A,A,1,w
      #                         A,B,2,x
      #                            |
      #               +------------+-------------+
      #               |                          |
      #             All()                    Label(foo)
      #               |                          |
      #            A,A,1,w              A,A,1,w,labels:[foo]
      #            A,B,2,x              A,B,2,x,labels:[foo]
      #               |                          |
      #               |                        Sink() --------------+
      #               |                          |                  |
      #               +------------+      x------+          A,A,1,w,labels:[foo]
      #                            |                        A,B,2,x,labels:[foo]
      #                         A,A,1,w                             |
      #                         A,B,2,x                             |
      #                            |                                |
      #                    QualifierRegex("B")                      |
      #                            |                                |
      #                         A,B,2,x                             |
      #                            |                                |
      #                            +--------------------------------+
      #                            |
      #                         A,A,1,w,labels:[foo]
      #                         A,B,2,x,labels:[foo]  # could be switched
      #                         A,B,2,x               # could be switched
      #
      # Despite being excluded by the qualifier filter, a copy of every cell
      # that reaches the sink is present in the final result.
      #
      # As with an interleave filter, duplicate cells are possible
      # and appear in an unspecified mutual order.
      # In this case we have a duplicate with column "A:B" and timestamp 2
      # because one copy passed through the All filter while the other was
      # passed through the Label and Sink filters. Note that one copy has the
      # label "foo", while the other does not.
      #
      # @example
      #
      #   # Pass filter
      #   Google::Cloud::Bigtable::RowFilter.pass
      #
      #   # Key regex filter
      #   Google::Cloud::Bigtable::RowFilter.key("user-.*")
      #
      #   # Cell limit filter
      #   Google::Cloud::Bigtable::RowFilter.cells_per_row(10)
      #
      module RowFilter
        # Argument-free filters are built once, frozen, and shared by every
        # caller of the matching factory method below.

        # @private
        PASS = SimpleFilter.new.pass.freeze

        # @private
        BLOCK = SimpleFilter.new.block.freeze

        # @private
        SINK = SimpleFilter.new.sink.freeze

        # @private
        STRIP_VALUE = SimpleFilter.new.strip_value.freeze

        private_constant :PASS, :BLOCK, :SINK, :STRIP_VALUE

        ##
        # Creates a chain filter instance.
        #
        # A chain RowFilter that sends rows through several RowFilters in sequence.
        #
        # See {Google::Cloud::Bigtable::RowFilter::ChainFilter}
        #
        # The elements of "filters" are chained together to process the input row:
        # in row -> f(0) -> intermediate row -> f(1) -> ... -> f(N) -> out row
        # The full chain is executed atomically.
        #
        # @return [Google::Cloud::Bigtable::RowFilter::ChainFilter]
        #
        # @example Create chain filter with simple filter.
        #
        #   chain = Google::Cloud::Bigtable::RowFilter.chain
        #
        #   # Add filters to chain filter
        #   chain.key("user-.*")
        #   chain.strip_value
        #
        #   # OR
        #   chain.key("user-.*").strip_value
        #
        # @example Create complex chain filter.
        #
        #   chain = Google::Cloud::Bigtable::RowFilter.chain
        #
        #   chain_1 = Google::Cloud::Bigtable::RowFilter.chain
        #   chain_1.label("users").qualifier("name").cells_per_row(5)
        #
        #   # Add to main chain filter
        #   chain.chain(chain_1).value("xyz.*").key("user-.*")
        #
        def self.chain
          ChainFilter.new
        end

        ##
        # Creates an interleave filter.
        #
        # A RowFilter that sends each row to each of several component
        # RowFilters and interleaves the results.
        #
        # The elements of "filters" all process a copy of the input row, and the
        # results are pooled, sorted, and combined into a single output row.
        # If multiple cells are produced with the same column and timestamp,
        # they will all appear in the output row in an unspecified mutual order.
        # Consider the following example, with three filters:
        #
        #                                  input row
        #                                      |
        #            -----------------------------------------------------
        #            |                         |                         |
        #           f(0)                      f(1)                      f(2)
        #            |                         |                         |
        #     1: foo,bar,10,x             foo,bar,10,z              far,bar,7,a
        #     2: foo,blah,11,z            far,blah,5,x              far,blah,5,x
        #            |                         |                         |
        #            -----------------------------------------------------
        #                                      |
        #     1:                      foo,bar,10,z   # could have switched with #2
        #     2:                      foo,bar,10,x   # could have switched with #1
        #     3:                      foo,blah,11,z
        #     4:                      far,bar,7,a
        #     5:                      far,blah,5,x   # identical to #6
        #     6:                      far,blah,5,x   # identical to #5
        #
        # All interleaved filters are executed atomically.
        #
        # @return [Google::Cloud::Bigtable::RowFilter::InterleaveFilter]
        #
        # @example Create an interleave filter with simple filter.
        #
        #   interleave = Google::Cloud::Bigtable::RowFilter.interleave
        #
        #   # Add filters to interleave filter
        #   interleave.key("user-.*")
        #   interleave.sink
        #
        #   # OR
        #   interleave.key("user-.*").sink
        #
        # @example Create complex interleave filter.
        #
        #   interleave = Google::Cloud::Bigtable::RowFilter.interleave
        #
        #   chain_1 = Google::Cloud::Bigtable::RowFilter.chain
        #   chain_1.label("users").qualifier("name").cells_per_row(5)
        #
        #   # Add to main interleave filter
        #   interleave.chain(chain_1).value("xyz.*").key("user-.*")
        #
        def self.interleave
          InterleaveFilter.new
        end

        ##
        # Creates a condition filter instance.
        #
        # A RowFilter that evaluates one of two possible RowFilters, depending on
        # whether or not a predicate RowFilter outputs any cells from the input row.
        #
        # IMPORTANT NOTE: The predicate filter does not execute atomically with the
        # true and false filters, which may lead to inconsistent or unexpected
        # results. Additionally, condition filters have poor performance, especially
        # when filters are set for the false condition.
        #
        # Cannot be used within the `predicate_filter`, `true_filter`, or
        # `false_filter`.
        #
        # @param predicate [SimpleFilter, ChainFilter, InterleaveFilter, ConditionFilter]
        # @return [Google::Cloud::Bigtable::RowFilter::ConditionFilter]
        #
        # @example
        #
        #   predicate = Google::Cloud::Bigtable::RowFilter.key("user-.*")
        #   condition = Google::Cloud::Bigtable::RowFilter.condition(predicate)
        #
        #   label = Google::Cloud::Bigtable::RowFilter.label("user")
        #   strip_value = Google::Cloud::Bigtable::RowFilter.strip_value
        #
        #   # On match apply label, else strip cell values
        #   condition.on_match(label).otherwise(strip_value)
        #
        def self.condition predicate
          ConditionFilter.new predicate
        end

        ##
        # Creates a pass filter instance.
        #
        # Matches all cells, regardless of input. Functionally equivalent to
        # leaving `filter` unset, but included for completeness.
        #
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.pass
        #
        def self.pass
          PASS
        end

        ##
        # Creates a block-all filter instance.
        #
        # Does not match any cells, regardless of input. Useful for temporarily
        # disabling just part of a filter.
        #
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.block
        #
        def self.block
          BLOCK
        end

        ##
        # Creates a sink filter instance.
        #
        # Outputs all cells directly to the output of the read rather than to any
        # parent filter.
        #
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.sink
        #
        def self.sink
          SINK
        end

        ##
        # Creates a strip value filter instance.
        #
        # Replaces each cell's value with an empty string.
        #
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.strip_value
        #
        def self.strip_value
          STRIP_VALUE
        end

        ##
        # Creates a key filter instance to match a row key using a regular expression.
        #
        # Matches only cells from rows whose row keys satisfy the given RE2 regex. In
        # other words, passes through the entire row when the key matches, and
        # otherwise produces an empty row.
        # Note that, since row keys can contain arbitrary bytes, the `\C` escape
        # sequence must be used if a true wildcard is desired. The `.` character
        # will not match the new line character `\n`, which may be present in a
        # binary key.
        #
        # For Regex syntax:
        # @see https://github.com/google/re2/wiki/Syntax
        #
        # @param regex [String] Regex to match row keys.
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.key("user-.*")
        #
        def self.key regex
          SimpleFilter.new.key regex
        end

        ##
        # Creates a sample probability filter instance.
        #
        # Matches all cells from a row with probability p, and matches no cells
        # from the row with probability 1-p.
        #
        # @param probability [Float] Probability value.
        #   Probability must be greater than 0 and less than 1.0.
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.sample(0.5)
        #
        def self.sample probability
          SimpleFilter.new.sample probability
        end

        ##
        # Creates a family name match filter using a regular expression.
        #
        # Matches only cells from columns whose families satisfy the given RE2
        # regex. For technical reasons, the regex must not contain the `:`
        # character, even if it is not being used as a literal.
        # Note that, since column families cannot contain the new line character
        # `\n`, it is sufficient to use `.` as a full wildcard when matching
        # column family names.
        #
        # For Regex syntax:
        # @see https://github.com/google/re2/wiki/Syntax
        #
        # @param regex [String] Regex to match family name.
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.family("cf-.*")
        #
        def self.family regex
          SimpleFilter.new.family regex
        end

        ##
        # Creates a column qualifier match filter using a regular expression.
        #
        # Matches only cells from columns whose qualifiers satisfy the given RE2
        # regex.
        # Note that, since column qualifiers can contain arbitrary bytes, the `\C`
        # escape sequence must be used if a true wildcard is desired. The `.`
        # character will not match the new line character `\n`, which may be
        # present in a binary qualifier.
        #
        # For Regex syntax:
        # @see https://github.com/google/re2/wiki/Syntax
        #
        # @param regex [String] Regex to match column qualifier name.
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.qualifier("user-name.*")
        #
        def self.qualifier regex
          SimpleFilter.new.qualifier regex
        end

        ##
        # Creates a value match filter using a regular expression.
        #
        # Matches only cells with values that satisfy the given regular expression.
        # Note that, since cell values can contain arbitrary bytes, the `\C` escape
        # sequence must be used if a true wildcard is desired. The `.` character
        # will not match the new line character `\n`, which may be present in a
        # binary value.
        #
        # For Regex syntax:
        # @see https://github.com/google/re2/wiki/Syntax
        #
        # @param regex [String] Regex to match cell value.
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.value("abc.*")
        #
        def self.value regex
          SimpleFilter.new.value regex
        end

        ##
        # Creates a label filter instance to apply a label based on the result of
        # read rows.
        #
        # Applies the given label to all cells in the output row. This allows
        # the client to determine which results were produced from which part of
        # the filter.
        #
        # Values must be at most 15 characters and match the RE2
        # pattern `[a-z0-9\\-]+`
        #
        # Due to a technical limitation, it is not possible to apply
        # multiple labels to a cell. As a result, a chain may have no more than
        # one sub-filter that contains an `apply_label_transformer`. It is okay for
        # an interleave to contain multiple `apply_label_transformers`, as they
        # will be applied to separate copies of the input.
        #
        # @param value [String] Label name
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.label("user-detail")
        #
        def self.label value
          SimpleFilter.new.label value
        end

        ##
        # Creates a cell-per-row-offset filter instance to skip first N cells.
        #
        # Skips the first N cells of each row, matching all subsequent cells.
        # If duplicate cells are present, as is possible when using an interleave,
        # each copy of the cell is counted separately.
        #
        # @param offset [Integer] Offset value.
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.cells_per_row_offset(3)
        #
        def self.cells_per_row_offset offset
          SimpleFilter.new.cells_per_row_offset offset
        end

        ##
        # Creates a cells-per-row limit filter instance.
        #
        # Matches only the first N cells of each row.
        # If duplicate cells are present, as is possible when using an interleave,
        # each copy of the cell is counted separately.
        #
        # @param limit [Integer] Max cell match per row limit
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.cells_per_row(5)
        #
        def self.cells_per_row limit
          SimpleFilter.new.cells_per_row limit
        end

        ##
        # Creates a cells-per-column filter instance.
        #
        # Matches only the most recent N cells within each column.
        # If duplicate cells are present, as is possible when using an interleave,
        # each copy of the cell is counted separately.
        #
        # @param limit [Integer] Max cell match per column limit
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.cells_per_column(5)
        #
        def self.cells_per_column limit
          SimpleFilter.new.cells_per_column limit
        end

        ##
        # Creates a timestamp-range filter instance.
        #
        # Matches only cells with timestamps within the given range.
        # Specifies a contiguous range of timestamps.
        #
        # @param from [Integer] Inclusive lower bound. If left empty, interpreted as 0.
        # @param to [Integer] Exclusive upper bound. If left empty, interpreted as infinity.
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #
        #   timestamp_micros = (Time.now.to_f * 1000000).round(-3)
        #   from = timestamp_micros - 300000000
        #   to = timestamp_micros
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.timestamp_range(from: from, to: to)
        #
        #   # From to infinity
        #   filter = Google::Cloud::Bigtable::RowFilter.timestamp_range(from: from)
        #
        #   # From 0 value to `to`
        #   filter = Google::Cloud::Bigtable::RowFilter.timestamp_range(to: to)
        #
        def self.timestamp_range from: nil, to: nil
          # The keyword arguments are forwarded positionally to SimpleFilter.
          SimpleFilter.new.timestamp_range from, to
        end

        ##
        # Creates a value-range filter instance.
        #
        # Matches only cells with values that fall within the given range.
        #
        # See {Google::Cloud::Bigtable::ValueRange#from} and
        # {Google::Cloud::Bigtable::ValueRange#to} for range
        # option inclusive/exclusive options
        #
        # * The value at which to start the range. If neither field is set, interpreted
        #   as an empty string, inclusive.
        # * The value at which to end the range. If neither field is set, interpreted
        #   as an infinite string, exclusive.
        #
        # @param range [Google::Cloud::Bigtable::ValueRange]
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example Start to end range
        #   require "google/cloud/bigtable"
        #
        #   bigtable = Google::Cloud::Bigtable.new
        #   table = bigtable.table("my-instance", "my-table")
        #
        #   range = table.new_value_range.from("value-001").to("value-005")
        #   filter = Google::Cloud::Bigtable::RowFilter.value_range(range)
        #
        # @example Start exclusive to infinite end range
        #   require "google/cloud/bigtable"
        #
        #   bigtable = Google::Cloud::Bigtable.new
        #   table = bigtable.table("my-instance", "my-table")
        #
        #   range = table.new_value_range.from("value-001", inclusive: false)
        #   filter = Google::Cloud::Bigtable::RowFilter.value_range(range)
        #
        def self.value_range range
          SimpleFilter.new.value_range range
        end

        ##
        # Creates a column-range filter instance.
        #
        # Matches only cells from columns within the given range.
        #
        # @param range [Google::Cloud::Bigtable::ColumnRange]
        # @return [Google::Cloud::Bigtable::RowFilter::SimpleFilter]
        #
        # @example
        #   require "google/cloud/bigtable"
        #
        #   range = Google::Cloud::Bigtable::ColumnRange.new("cf").from("field0").to("field5")
        #
        #   filter = Google::Cloud::Bigtable::RowFilter.column_range(range)
        #
        def self.column_range range
          SimpleFilter.new.column_range range
        end
      end
    end
  end
end