lib/google/cloud/bigquery/project.rb in google-cloud-bigquery-0.28.0 vs lib/google/cloud/bigquery/project.rb in google-cloud-bigquery-0.29.0
- old
+ new
@@ -17,11 +17,11 @@
require "google/cloud/errors"
require "google/cloud/bigquery/service"
require "google/cloud/bigquery/credentials"
require "google/cloud/bigquery/dataset"
require "google/cloud/bigquery/job"
-require "google/cloud/bigquery/query_data"
+require "google/cloud/bigquery/external"
require "google/cloud/bigquery/project/list"
require "google/cloud/bigquery/time"
require "google/cloud/bigquery/schema"
module Google
@@ -126,10 +126,14 @@
# or named (`@myparam`) query parameters. If the value passed is an
# array `["foo"]`, the query must use positional query parameters. If
# the value passed is a hash `{ myparam: "foo" }`, the query must use
# named query parameters. When set, `legacy_sql` will automatically be
# set to false and `standard_sql` to true.
+ # @param [Hash<String|Symbol, External::DataSource>] external A Hash
+ # that represents the mapping of the external tables to the table
+ # names used in the SQL query. The hash keys are the table names, and
+ # the hash values are the external table objects. See {Project#query}.
# @param [String] priority Specifies a priority for the query. Possible
# values include `INTERACTIVE` and `BATCH`. The default value is
# `INTERACTIVE`.
# @param [Boolean] cache Whether to look for the result in the query
# cache. The query cache is a best-effort cache that will be flushed
@@ -156,13 +160,13 @@
# * `append` - BigQuery appends the data to the table.
# * `empty` - A 'duplicate' error is returned in the job result if the
# table exists and contains data.
# @param [Dataset, String] dataset The default dataset to use for
# unqualified table names in the query. Optional.
- # @param [Boolean] large_results If `true`, allows the query to produce
- # arbitrarily large result tables at a slight cost in performance.
- # Requires `table` parameter to be set.
+ # @param [String] project Specifies the default projectId to assume for
+ # any unqualified table names in the query. Only used if `dataset`
+ # option is set.
# @param [Boolean] standard_sql Specifies whether to use BigQuery's
# [standard
# SQL](https://cloud.google.com/bigquery/docs/reference/standard-sql/)
# dialect for this query. If set to true, the query will use standard
# SQL rather than the [legacy
@@ -190,11 +194,43 @@
# queries](https://cloud.google.com/bigquery/pricing#high-compute).
# @param [Integer] maximum_bytes_billed Limits the bytes billed for this
# job. Queries that will have bytes billed beyond this limit will fail
# (without incurring a charge). Optional. If unspecified, this will be
# set to your project default.
+ # @param [String] job_id A user-defined ID for the query job. The ID
+ # must contain only letters (a-z, A-Z), numbers (0-9), underscores
+ # (_), or dashes (-). The maximum length is 1,024 characters. If
+ # `job_id` is provided, then `prefix` will not be used.
#
+ # See [Generating a job
+ # ID](https://cloud.google.com/bigquery/docs/managing-jobs#generate-jobid).
+ # @param [String] prefix A string, usually human-readable, that will be
+ # prepended to a generated value to produce a unique job ID. For
+ # example, the prefix `daily_import_job_` can be given to generate a
+ # job ID such as `daily_import_job_12vEDtMQ0mbp1Mo5Z7mzAFQJZazh`. The
+ # prefix must contain only letters (a-z, A-Z), numbers (0-9),
+ # underscores (_), or dashes (-). The maximum length of the entire ID
+ # is 1,024 characters. If `job_id` is provided, then `prefix` will not
+ # be used.
+ #
+ # See [Generating a job
+ # ID](https://cloud.google.com/bigquery/docs/managing-jobs#generate-jobid).
+ # @param [Hash] labels A hash of user-provided labels associated with
+ # the job. You can use these to organize and group your jobs. Label
+ # keys and values can be no longer than 63 characters, can only
+ # contain lowercase letters, numeric characters, underscores and
+ # dashes. International characters are allowed. Label values are
+ # optional. Label keys must start with a letter and each label in the
+ # list must have a different key.
+ # @param [Array<String>, String] udfs User-defined function resources
+ # used in the query. May be either a code resource to load from a
+ # Google Cloud Storage URI (`gs://bucket/path`), or an inline resource
+ # that contains code for a user-defined function (UDF). Providing an
+ # inline code resource is equivalent to providing a URI for a file
+ # containing the same code. See [User-Defined
+ # Functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions).
+ #
# @return [Google::Cloud::Bigquery::QueryJob]
#
# @example Query using standard SQL:
# require "google/cloud/bigquery"
#
@@ -203,11 +239,11 @@
# job = bigquery.query_job "SELECT name FROM " \
# "`my_project.my_dataset.my_table`"
#
# job.wait_until_done!
# if !job.failed?
- # job.query_results.each do |row|
+ # job.data.each do |row|
# puts row[:name]
# end
# end
#
# @example Query using legacy SQL:
@@ -219,11 +255,11 @@
# "[my_project:my_dataset.my_table]",
# legacy_sql: true
#
# job.wait_until_done!
# if !job.failed?
- # job.query_results.each do |row|
+ # job.data.each do |row|
# puts row[:name]
# end
# end
#
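# @example Write results to a destination table (a sketch; it assumes
# the table "my_results_table" already exists in the dataset):
# require "google/cloud/bigquery"
#
# bigquery = Google::Cloud::Bigquery.new
# dataset = bigquery.dataset "my_dataset"
#
# job = bigquery.query_job "SELECT name FROM " \
# "`my_project.my_dataset.my_table`",
# table: dataset.table("my_results_table"),
# write: "truncate"
#
# job.wait_until_done!
#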
# @example Query using positional query parameters:
@@ -236,11 +272,11 @@
# " WHERE id = ?",
# params: [1]
#
# job.wait_until_done!
# if !job.failed?
- # job.query_results.each do |row|
+ # job.data.each do |row|
# puts row[:name]
# end
# end
#
# @example Query using named query parameters:
@@ -253,36 +289,62 @@
# " WHERE id = @id",
# params: { id: 1 }
#
# job.wait_until_done!
# if !job.failed?
- # job.query_results.each do |row|
+ # job.data.each do |row|
# puts row[:name]
# end
# end
#
- def query_job query, params: nil, priority: "INTERACTIVE", cache: true,
- table: nil, create: nil, write: nil, dataset: nil,
+ # @example Query using external data source:
+ # require "google/cloud/bigquery"
+ #
+ # bigquery = Google::Cloud::Bigquery.new
+ #
+ # csv_url = "gs://bucket/path/to/data.csv"
+ # csv_table = bigquery.external csv_url do |csv|
+ # csv.autodetect = true
+ # csv.skip_leading_rows = 1
+ # end
+ #
+ # job = bigquery.query_job "SELECT * FROM my_ext_table",
+ # external: { my_ext_table: csv_table }
+ #
+ # job.wait_until_done!
+ # if !job.failed?
+ # job.data.each do |row|
+ # puts row[:name]
+ # end
+ # end
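+ #
+ # @example Query using a job ID prefix and labels (a sketch; the
+ # prefix and label values are illustrative):
+ # require "google/cloud/bigquery"
+ #
+ # bigquery = Google::Cloud::Bigquery.new
+ #
+ # job = bigquery.query_job "SELECT name FROM " \
+ # "`my_project.my_dataset.my_table`",
+ # prefix: "daily_import_job_",
+ # labels: { "owner" => "reporting" }
+ #
+ # job.wait_until_done!
+ # job.job_id #=> e.g. "daily_import_job_12vEDtMQ0mbp1Mo5Z7mzAFQJZazh"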
+ #
+ def query_job query, params: nil, external: nil,
+ priority: "INTERACTIVE", cache: true, table: nil,
+ create: nil, write: nil, dataset: nil, project: nil,
standard_sql: nil, legacy_sql: nil, large_results: nil,
flatten: nil, maximum_billing_tier: nil,
- maximum_bytes_billed: nil
+ maximum_bytes_billed: nil, job_id: nil, prefix: nil,
+ labels: nil, udfs: nil
ensure_service!
options = { priority: priority, cache: cache, table: table,
create: create, write: write,
large_results: large_results, flatten: flatten,
- dataset: dataset, legacy_sql: legacy_sql,
- standard_sql: standard_sql,
+ dataset: dataset, project: project,
+ legacy_sql: legacy_sql, standard_sql: standard_sql,
maximum_billing_tier: maximum_billing_tier,
maximum_bytes_billed: maximum_bytes_billed,
- params: params }
+ params: params, external: external, labels: labels,
+ job_id: job_id, prefix: prefix, udfs: udfs }
gapi = service.query_job query, options
Job.from_gapi gapi, service
end
##
- # Queries data using the [synchronous
- # method](https://cloud.google.com/bigquery/querying-data).
+ # Queries data using a synchronous method that blocks for a response. In
+ # this method, a {QueryJob} is created and its results are saved
+ # to a temporary table, then read from the table. Timeouts and transient
+ # errors are generally handled as needed to complete the query.
#
# When using standard SQL and passing arguments using `params`, Ruby
# types are mapped to BigQuery types as follows:
#
# | BigQuery | Ruby | Notes |
@@ -300,10 +362,12 @@
# | `STRUCT` | `Hash` | Hash keys may be strings or symbols. |
#
# See [Data Types](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types)
# for an overview of each BigQuery data type, including allowed values.
#
+ # @see https://cloud.google.com/bigquery/querying-data Querying Data
+ #
# @param [String] query A query string, following the BigQuery [query
# syntax](https://cloud.google.com/bigquery/query-reference), of the
# query to execute. Example: "SELECT count(f1) FROM
# [myProjectId:myDatasetId.myTableId]".
# @param [Array, Hash] params Standard SQL only. Used to pass query
@@ -311,26 +375,20 @@
# or named (`@myparam`) query parameters. If the value passed is an
# array `["foo"]`, the query must use positional query parameters. If
# the value passed is a hash `{ myparam: "foo" }`, the query must use
# named query parameters. When set, `legacy_sql` will automatically be
# set to false and `standard_sql` to true.
+ # @param [Hash<String|Symbol, External::DataSource>] external A Hash
+ # that represents the mapping of the external tables to the table
+ # names used in the SQL query. The hash keys are the table names, and
+ # the hash values are the external table objects. See {Project#query}.
# @param [Integer] max The maximum number of rows of data to return per
# page of results. Setting this flag to a small value such as 1000 and
# then paging through results might improve reliability when the query
# result set is large. In addition to this limit, responses are also
# limited to 10 MB. By default, there is no maximum row count, and
# only the byte limit applies.
- # @param [Integer] timeout How long to wait for the query to complete,
- # in milliseconds, before the request times out and returns. Note that
- # this is only a timeout for the request, not the query. If the query
- # takes longer to run than the timeout value, the call returns without
- # any results and with QueryData#complete? set to false. The default
- # value is 10000 milliseconds (10 seconds).
- # @param [Boolean] dryrun If set to `true`, BigQuery doesn't run the
- # job. Instead, if the query is valid, BigQuery returns statistics
- # about the job such as how many bytes would be processed. If the
- # query is invalid, an error returns. The default value is `false`.
# @param [Boolean] cache Whether to look for the result in the query
# cache. The query cache is a best-effort cache that will be flushed
# whenever tables in the query are modified. The default value is
# true. For more information, see [query
# caching](https://developers.google.com/bigquery/querying-data).
@@ -359,11 +417,11 @@
# SQL](https://cloud.google.com/bigquery/docs/reference/standard-sql/)
# When set to false, the values of `large_results` and `flatten` are
# ignored; the query will be run as if `large_results` is true and
# `flatten` is false. Optional. The default value is false.
#
- # @return [Google::Cloud::Bigquery::QueryData]
+ # @return [Google::Cloud::Bigquery::Data]
#
# @example Query using standard SQL:
# require "google/cloud/bigquery"
#
# bigquery = Google::Cloud::Bigquery.new
@@ -385,11 +443,11 @@
#
# data.each do |row|
# puts row[:name]
# end
#
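# @example Paging through a large result set with `max` (a sketch; see
# also {Data#next?} and {Data#next}):
# require "google/cloud/bigquery"
#
# bigquery = Google::Cloud::Bigquery.new
#
# data = bigquery.query "SELECT name FROM `my_dataset.my_table`",
# max: 1000
#
# data.each do |row|
# puts row[:name]
# end
# data = data.next if data.next?
#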
- # @example Retrieve all rows: (See {QueryData#all})
+ # @example Retrieve all rows: (See {Data#all})
# require "google/cloud/bigquery"
#
# bigquery = Google::Cloud::Bigquery.new
#
# data = bigquery.query "SELECT name FROM `my_dataset.my_table`"
@@ -424,23 +482,103 @@
#
# data.each do |row|
# puts row[:name]
# end
#
- def query query, params: nil, max: nil, timeout: 10000, dryrun: nil,
- cache: true, dataset: nil, project: nil, standard_sql: nil,
- legacy_sql: nil
+ # @example Query using external data source:
+ # require "google/cloud/bigquery"
+ #
+ # bigquery = Google::Cloud::Bigquery.new
+ #
+ # csv_url = "gs://bucket/path/to/data.csv"
+ # csv_table = bigquery.external csv_url do |csv|
+ # csv.autodetect = true
+ # csv.skip_leading_rows = 1
+ # end
+ #
+ # data = bigquery.query "SELECT * FROM my_ext_table",
+ # external: { my_ext_table: csv_table }
+ #
+ # data.each do |row|
+ # puts row[:name]
+ # end
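+ #
+ # @example Query using typed parameters (a sketch of the type mapping
+ # above; the column names are invented):
+ # require "google/cloud/bigquery"
+ #
+ # bigquery = Google::Cloud::Bigquery.new
+ #
+ # data = bigquery.query "SELECT name FROM `my_dataset.my_table` " \
+ # "WHERE created_at < @cutoff AND score >= @min",
+ # params: { cutoff: Time.now, min: 90.0 }
+ #
+ # data.each do |row|
+ # puts row[:name]
+ # end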
+ #
+ def query query, params: nil, external: nil, max: nil, cache: true,
+ dataset: nil, project: nil, standard_sql: nil, legacy_sql: nil
ensure_service!
- options = { max: max, timeout: timeout, dryrun: dryrun, cache: cache,
- dataset: dataset, project: project,
+ options = { cache: cache, dataset: dataset, project: project,
legacy_sql: legacy_sql, standard_sql: standard_sql,
- params: params }
- gapi = service.query query, options
- QueryData.from_gapi gapi, service
+ params: params, external: external }
+
+ job = query_job query, options
+ job.wait_until_done!
+
+ if job.failed?
+ begin
+ # raise to activate ruby exception cause handling
+ fail job.gapi_error
+ rescue => e
+ # wrap Google::Apis::Error with Google::Cloud::Error
+ raise Google::Cloud::Error.from_error(e)
+ end
+ end
+
+ job.data max: max
end
##
+ # Creates a new External::DataSource (or subclass) object that
+ # represents an external data source that can be queried directly,
+ # even though the data is not stored in BigQuery. Instead of loading
+ # or streaming the data, this object references the external data
+ # source.
+ #
+ # @see https://cloud.google.com/bigquery/external-data-sources Querying
+ # External Data Sources
+ #
+ # @param [String, Array<String>] url The fully-qualified URL(s) that
+ # point to your data in Google Cloud. An attempt will be made to
+ # derive the format from the URLs provided.
+ # @param [String|Symbol] format The data format. This value will be used
+ # even if the provided URLs are recognized as a different format.
+ # Optional.
+ #
+ # The following values are supported:
+ #
+ # * `csv` - CSV
+ # * `json` - [Newline-delimited JSON](http://jsonlines.org/)
+ # * `avro` - [Avro](http://avro.apache.org/)
+ # * `sheets` - Google Sheets
+ # * `datastore_backup` - Cloud Datastore backup
+ # * `bigtable` - Bigtable
+ #
+ # @return [External::DataSource] External data source.
+ #
+ # @example
+ # require "google/cloud/bigquery"
+ #
+ # bigquery = Google::Cloud::Bigquery.new
+ #
+ # csv_url = "gs://bucket/path/to/data.csv"
+ # csv_table = bigquery.external csv_url do |csv|
+ # csv.autodetect = true
+ # csv.skip_leading_rows = 1
+ # end
+ #
+ # data = bigquery.query "SELECT * FROM my_ext_table",
+ # external: { my_ext_table: csv_table }
+ #
+ # data.each do |row|
+ # puts row[:name]
+ # end
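+ #
+ # @example Overriding the format when it cannot be derived from the
+ # URL (a sketch; the URL and setting are illustrative):
+ # require "google/cloud/bigquery"
+ #
+ # bigquery = Google::Cloud::Bigquery.new
+ #
+ # json_url = "gs://bucket/path/to/data-export"
+ # json_table = bigquery.external json_url, format: :json do |json|
+ # json.autodetect = true
+ # end
+ #
+ # data = bigquery.query "SELECT * FROM my_ext_table",
+ # external: { my_ext_table: json_table }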
+ #
+ def external url, format: nil
+ ext = External.from_urls url, format
+ yield ext if block_given?
+ ext
+ end
+
+ ##
# Retrieves an existing dataset by ID.
#
# @param [String] dataset_id The ID of a dataset.
#
# @return [Google::Cloud::Bigquery::Dataset, nil] Returns `nil` if the
@@ -537,10 +675,15 @@
##
# Retrieves the list of datasets belonging to the project.
#
# @param [Boolean] all Whether to list all datasets, including hidden
# ones. The default is `false`.
+ # @param [String] filter An expression for filtering the results of the
+ # request by label. The syntax is `labels.<name>[:<value>]`.
+ # Multiple filters can be `AND`ed together by connecting with a space.
+ # Example: `labels.department:receiving labels.active`. See [Filtering
+ # datasets using labels](https://cloud.google.com/bigquery/docs/labeling-datasets#filtering_datasets_using_labels).
# @param [String] token A previously-returned page token representing
# part of the larger set of results to view.
# @param [Integer] max Maximum number of datasets to return.
#
# @return [Array<Google::Cloud::Bigquery::Dataset>] (See
@@ -571,15 +714,15 @@
# datasets = bigquery.datasets
# datasets.all do |dataset|
# puts dataset.name
# end
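+ #
+ # @example Retrieve datasets filtered by labels (a sketch; the label
+ # names are invented):
+ # require "google/cloud/bigquery"
+ #
+ # bigquery = Google::Cloud::Bigquery.new
+ #
+ # datasets = bigquery.datasets filter: "labels.department:receiving"
+ # datasets.each do |dataset|
+ # puts dataset.name
+ # end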
#
- def datasets all: nil, token: nil, max: nil
+ def datasets all: nil, filter: nil, token: nil, max: nil
ensure_service!
- options = { all: all, token: token, max: max }
+ options = { all: all, filter: filter, token: token, max: max }
gapi = service.list_datasets options
- Dataset::List.from_gapi gapi, service, all, max
+ Dataset::List.from_gapi gapi, service, all, filter, max
end
##
# Retrieves an existing job by ID.
#
@@ -755,14 +898,14 @@
##
# Creates a new schema instance. An optional block may be given to
# configure the schema; otherwise, the schema is returned empty and
# may be configured directly.
#
- # The returned schema can be passed to {Dataset#load} using the `schema`
- # option. However, for most use cases, the block yielded by
- # {Dataset#load} is a more convenient way to configure the schema for
- # the destination table.
+ # The returned schema can be passed to {Dataset#load} using the
+ # `schema` option. However, for most use cases, the block yielded by
+ # {Dataset#load} is a more convenient way to configure the schema
+ # for the destination table.
#
# @yield [schema] a block for setting the schema
# @yieldparam [Schema] schema the object accepting the schema
#
# @return [Google::Cloud::Bigquery::Schema]
@@ -781,10 +924,10 @@
# end
#
# dataset = bigquery.dataset "my_dataset"
#
# gs_url = "gs://my-bucket/file-name.csv"
- # load_job = dataset.load "my_new_table", gs_url, schema: schema
+ # load_job = dataset.load_job "my_new_table", gs_url, schema: schema
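+ #
+ # @example Configuring the returned schema directly, without a block
+ # (a minimal sketch; the field names are invented):
+ # require "google/cloud/bigquery"
+ #
+ # bigquery = Google::Cloud::Bigquery.new
+ #
+ # schema = bigquery.schema
+ # schema.string "first_name", mode: :required
+ # schema.integer "age"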
#
def schema
s = Schema.from_gapi
yield s if block_given?
s