# Copyright 2015 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


require "google/apis/bigquery_v2"

module Google
  module Cloud
    module Bigquery
      ##
      # # ExtractJob
      #
      # A {Job} subclass representing an export operation that may be
      # performed on a {Table} or {Model}. An ExtractJob instance is returned
      # when you call {Project#extract_job}, {Table#extract_job} or
      # {Model#extract_job}.
      #
      # @see https://cloud.google.com/bigquery/docs/exporting-data
      #   Exporting table data
      # @see https://cloud.google.com/bigquery-ml/docs/exporting-models
      #   Exporting models
      # @see https://cloud.google.com/bigquery/docs/reference/v2/jobs Jobs API
      #   reference
      #
      # @example Export table data
      #   require "google/cloud/bigquery"
      #
      #   bigquery = Google::Cloud::Bigquery.new
      #   dataset = bigquery.dataset "my_dataset"
      #   table = dataset.table "my_table"
      #
      #   extract_job = table.extract_job "gs://my-bucket/file-name.json",
      #                                   format: "json"
      #   extract_job.wait_until_done!
      #   extract_job.done? #=> true
      #
      # @example Export a model
      #   require "google/cloud/bigquery"
      #
      #   bigquery = Google::Cloud::Bigquery.new
      #   dataset = bigquery.dataset "my_dataset"
      #   model = dataset.model "my_model"
      #
      #   extract_job = model.extract_job "gs://my-bucket/#{model.model_id}"
      #
      #   extract_job.wait_until_done!
      #   extract_job.done? #=> true
      #
      class ExtractJob < Job
        ##
        # The URI or URIs representing the Google Cloud Storage files to which
        # the data is exported.
        #
        # @return [Array<String>] The Cloud Storage URIs or URI patterns.
        #
        def destinations
          Array @gapi.configuration.extract.destination_uris
        end

        ##
        # The table or model which is exported.
        #
        # @return [Table, Model, nil] A table or model instance, or `nil`.
        #
        def source
          if (table = @gapi.configuration.extract.source_table)
            retrieve_table table.project_id, table.dataset_id, table.table_id
          elsif (model = @gapi.configuration.extract.source_model)
            retrieve_model model.project_id, model.dataset_id, model.model_id
          end
        end

        ##
        # Whether the source of the export job is a table. See {#source}.
        #
        # @return [Boolean] `true` when the source is a table, `false`
        #   otherwise.
        #
        def table?
          !@gapi.configuration.extract.source_table.nil?
        end

        ##
        # Whether the source of the export job is a model. See {#source}.
        #
        # @return [Boolean] `true` when the source is a model, `false`
        #   otherwise.
        #
        def model?
          !@gapi.configuration.extract.source_model.nil?
        end

        ##
        # Checks if the export operation compresses the data using gzip. The
        # default is `false`. Not applicable when extracting models.
        #
        # @return [Boolean] `true` when `GZIP`, `false` if not `GZIP` or not a
        #   table extraction.
        #
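        # @example Requesting a gzip export, then checking the flag (a sketch; bucket and table names are placeholders)
        #   require "google/cloud/bigquery"
        #
        #   bigquery = Google::Cloud::Bigquery.new
        #   table = bigquery.dataset("my_dataset").table "my_table"
        #
        #   extract_job = table.extract_job "gs://my-bucket/file-name.csv",
        #                                   compression: "GZIP"
        #   extract_job.wait_until_done!
        #   extract_job.compression? #=> true
        #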
        def compression?
          return false unless table?
          @gapi.configuration.extract.compression == "GZIP"
        end

        ##
        # Checks if the destination format for the table data is
        # [newline-delimited JSON](https://jsonlines.org/). The default is
        # `false`. Not applicable when extracting models.
        #
        # @return [Boolean] `true` when `NEWLINE_DELIMITED_JSON`, `false` if
        #   not `NEWLINE_DELIMITED_JSON` or not a table extraction.
        #
        def json?
          return false unless table?
          @gapi.configuration.extract.destination_format == "NEWLINE_DELIMITED_JSON"
        end

        ##
        # Checks if the destination format for the table data is CSV. Tables
        # with nested or repeated fields cannot be exported as CSV. The
        # default is `true` for tables. Not applicable when extracting models.
        #
        # @return [Boolean] `true` when `CSV`, or `false` if not `CSV` or not
        #   a table extraction.
        #
        def csv?
          return false unless table?
          val = @gapi.configuration.extract.destination_format
          return true if val.nil?
          val == "CSV"
        end

        ##
        # Checks if the destination format for the table data is
        # [Avro](http://avro.apache.org/). The default is `false`. Not
        # applicable when extracting models.
        #
        # @return [Boolean] `true` when `AVRO`, `false` if not `AVRO` or not a
        #   table extraction.
        #
        def avro?
          return false unless table?
          @gapi.configuration.extract.destination_format == "AVRO"
        end

        ##
        # Checks if the destination format for the model is TensorFlow
        # SavedModel. The default is `true` for models. Not applicable when
        # extracting tables.
        #
        # @return [Boolean] `true` when `ML_TF_SAVED_MODEL`, `false` if not
        #   `ML_TF_SAVED_MODEL` or not a model extraction.
        #
        def ml_tf_saved_model?
          return false unless model?
          val = @gapi.configuration.extract.destination_format
          return true if val.nil?
          val == "ML_TF_SAVED_MODEL"
        end

        ##
        # Checks if the destination format for the model is XGBoost. The
        # default is `false`. Not applicable when extracting tables.
        #
        # @return [Boolean] `true` when `ML_XGBOOST_BOOSTER`, `false` if not
        #   `ML_XGBOOST_BOOSTER` or not a model extraction.
        #
        def ml_xgboost_booster?
          return false unless model?
          @gapi.configuration.extract.destination_format == "ML_XGBOOST_BOOSTER"
        end

        ##
        # The character or symbol the operation uses to delimit fields in the
        # exported data. The default is a comma (`,`) for tables. Not
        # applicable when extracting models.
        #
        # @return [String, nil] A string containing the character, such as
        #   `","`, or `nil` if not a table extraction.
        #
        def delimiter
          return unless table?
          val = @gapi.configuration.extract.field_delimiter
          val = "," if val.nil?
          val
        end

        ##
        # Checks if the exported data contains a header row. The default is
        # `true` for tables. Not applicable when extracting models.
        #
        # @return [Boolean] `true` when the print header configuration is
        #   present or `nil`, `false` if disabled or not a table extraction.
        #
        def print_header?
          return false unless table?
          val = @gapi.configuration.extract.print_header
          val = true if val.nil?
          val
        end

        ##
        # The number of files per destination URI or URI pattern specified in
        # {#destinations}.
        #
        # @return [Array<Integer>] An array of file counts in the same order
        #   as the URI patterns.
        #
        def destinations_file_counts
          Array @gapi.statistics.extract.destination_uri_file_counts
        end

        ##
        # A hash containing the URI or URI pattern specified in
        # {#destinations} mapped to the counts of files per destination.
        #
        # @return [Hash<String, Integer>] A Hash with the URI patterns as keys
        #   and the counts as values.
        #
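        # @example Reading file counts per URI pattern (a sketch; the bucket name and counts are illustrative)
        #   require "google/cloud/bigquery"
        #
        #   bigquery = Google::Cloud::Bigquery.new
        #   table = bigquery.dataset("my_dataset").table "my_table"
        #
        #   extract_job = table.extract_job "gs://my-bucket/file-*.csv"
        #   extract_job.wait_until_done!
        #   extract_job.destinations_counts #=> { "gs://my-bucket/file-*.csv" => 3 }
        #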
        def destinations_counts
          destinations.zip(destinations_file_counts).to_h
        end

        ##
        # If `#avro?` (`#format` is set to `"AVRO"`), this flag indicates
        # whether to enable extracting applicable column types (such as
        # `TIMESTAMP`) to their corresponding AVRO logical types
        # (`timestamp-micros`), instead of only using their raw types
        # (`avro-long`). Not applicable when extracting models.
        #
        # @return [Boolean] `true` when applicable column types will use their
        #   corresponding AVRO logical types, `false` if not enabled or not a
        #   table extraction.
        #
        def use_avro_logical_types?
          return false unless table?
          @gapi.configuration.extract.use_avro_logical_types
        end

        ##
        # Yielded to a block to accumulate changes for an API request.
        class Updater < ExtractJob
          ##
          # @private Create an Updater object.
          def initialize gapi
            super()
            @gapi = gapi
          end

          ##
          # @private Create an Updater from an options hash.
          #
          # @return [Google::Cloud::Bigquery::ExtractJob::Updater] A job
          #   configuration object for setting query options.
          def self.from_options service, source, storage_files, options
            job_ref = service.job_ref_from options[:job_id], options[:prefix]
            storage_urls = Array(storage_files).map do |url|
              url.respond_to?(:to_gs_url) ? url.to_gs_url : url
            end
            options[:format] ||= Convert.derive_source_format storage_urls.first
            extract_config = Google::Apis::BigqueryV2::JobConfigurationExtract.new(
              destination_uris: Array(storage_urls)
            )
            case source
            when Google::Apis::BigqueryV2::TableReference
              extract_config.source_table = source
            when Google::Apis::BigqueryV2::ModelReference
              extract_config.source_model = source
            end
            job = Google::Apis::BigqueryV2::Job.new(
              job_reference: job_ref,
              configuration: Google::Apis::BigqueryV2::JobConfiguration.new(
                extract: extract_config,
                dry_run: options[:dryrun]
              )
            )
            from_job_and_options job, options
          end

          ##
          # @private Create an Updater from a Job and options hash.
          #
          # @return [Google::Cloud::Bigquery::ExtractJob::Updater] A job
          #   configuration object for setting query options.
          def self.from_job_and_options request, options
            updater = ExtractJob::Updater.new request
            updater.compression = options[:compression]
            updater.delimiter = options[:delimiter]
            updater.format = options[:format]
            updater.header = options[:header]
            updater.labels = options[:labels] if options[:labels]
            unless options[:use_avro_logical_types].nil?
              updater.use_avro_logical_types = options[:use_avro_logical_types]
            end
            updater
          end

          ##
          # Sets the geographic location where the job should run. Required
          # except for US and EU.
          #
          # @param [String] value A geographic location, such as "US", "EU" or
          #   "asia-northeast1". Required except for US and EU.
          #
          # @example
          #   require "google/cloud/bigquery"
          #
          #   bigquery = Google::Cloud::Bigquery.new
          #   dataset = bigquery.dataset "my_dataset"
          #   table = dataset.table "my_table"
          #
          #   destination = "gs://my-bucket/file-name.csv"
          #   extract_job = table.extract_job destination do |j|
          #     j.location = "EU"
          #   end
          #
          #   extract_job.wait_until_done!
          #   extract_job.done? #=> true
          #
          # @!group Attributes
          def location= value
            @gapi.job_reference.location = value
            return unless value.nil?

            # Treat assigning value of nil the same as unsetting the value.
            unset = @gapi.job_reference.instance_variables.include? :@location
            @gapi.job_reference.remove_instance_variable :@location if unset
          end

          ##
          # Sets the compression type. Not applicable when extracting models.
          #
          # @param [String] value The compression type to use for exported
          #   files. Possible values include `GZIP` and `NONE`. The default
          #   value is `NONE`.
          #
          # @!group Attributes
          def compression= value
            @gapi.configuration.extract.compression = value
          end

          ##
          # Sets the field delimiter. Not applicable when extracting models.
          #
          # @param [String] value Delimiter to use between fields in the
          #   exported data. Default is `,`.
          #
          # @!group Attributes
          def delimiter= value
            @gapi.configuration.extract.field_delimiter = value
          end

          ##
          # Sets the destination file format. The default value for tables is
          # `csv`. Tables with nested or repeated fields cannot be exported as
          # CSV. The default value for models is `ml_tf_saved_model`.
          #
          # Supported values for tables:
          #
          # * `csv` - CSV
          # * `json` - [Newline-delimited JSON](https://jsonlines.org/)
          # * `avro` - [Avro](http://avro.apache.org/)
          #
          # Supported values for models:
          #
          # * `ml_tf_saved_model` - TensorFlow SavedModel
          # * `ml_xgboost_booster` - XGBoost Booster
          #
          # @param [String] new_format The new source format.
          #
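          # @example Setting the format in an extract block (a sketch; bucket and table names are placeholders)
          #   require "google/cloud/bigquery"
          #
          #   bigquery = Google::Cloud::Bigquery.new
          #   table = bigquery.dataset("my_dataset").table "my_table"
          #
          #   extract_job = table.extract_job "gs://my-bucket/file-name.json" do |j|
          #     j.format = "json"
          #   end
          #   extract_job.wait_until_done!
          #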
          # @!group Attributes
          #
          def format= new_format
            @gapi.configuration.extract.update! destination_format: Convert.source_format(new_format)
          end

          ##
          # Print a header row in the exported file. Not applicable when
          # extracting models.
          #
          # @param [Boolean] value Whether to print out a header row in the
          #   results. Default is `true`.
          #
          # @!group Attributes
          def header= value
            @gapi.configuration.extract.print_header = value
          end

          ##
          # Sets the labels to use for the job.
          #
          # @param [Hash] value A hash of user-provided labels associated with
          #   the job. You can use these to organize and group your jobs.
          #
          #   The labels applied to a resource must meet the following
          #   requirements:
          #
          #   * Each resource can have multiple labels, up to a maximum of 64.
          #   * Each label must be a key-value pair.
          #   * Keys have a minimum length of 1 character and a maximum length
          #     of 63 characters, and cannot be empty. Values can be empty,
          #     and have a maximum length of 63 characters.
          #   * Keys and values can contain only lowercase letters, numeric
          #     characters, underscores, and dashes. All characters must use
          #     UTF-8 encoding, and international characters are allowed.
          #   * The key portion of a label must be unique. However, you can
          #     use the same key with multiple resources.
          #   * Keys must start with a lowercase letter or international
          #     character.
          #
          # @!group Attributes
          #
          def labels= value
            @gapi.configuration.update! labels: value
          end

          ##
          # Indicate whether to enable extracting applicable column types
          # (such as `TIMESTAMP`) to their corresponding AVRO logical types
          # (`timestamp-micros`), instead of only using their raw types
          # (`avro-long`).
          #
          # Only used when `#format` is set to `"AVRO"` (`#avro?`).
          #
          # @param [Boolean] value Whether applicable column types will use
          #   their corresponding AVRO logical types.
          #
          # @!group Attributes
          def use_avro_logical_types= value
            @gapi.configuration.extract.use_avro_logical_types = value
          end

          # Job lifecycle methods are not applicable while building a job
          # configuration, so they raise if called on the Updater.

          def cancel
            raise "not implemented in #{self.class}"
          end

          def rerun!
            raise "not implemented in #{self.class}"
          end

          def reload!
            raise "not implemented in #{self.class}"
          end
          alias refresh! reload!

          def wait_until_done!
            raise "not implemented in #{self.class}"
          end

          ##
          # @private Returns the Google API client library version of this
          # job.
          #
          # @return [Google::Apis::BigqueryV2::Job] (See
          #   {Google::Apis::BigqueryV2::Job})
          def to_gapi
            @gapi
          end
        end

        protected

        def retrieve_model project_id, dataset_id, model_id
          ensure_service!
          gapi = service.get_project_model project_id, dataset_id, model_id
          Model.from_gapi_json gapi, service
        rescue Google::Cloud::NotFoundError
          nil
        end
      end
    end
  end
end