lib/google/cloud/bigquery/project.rb in google-cloud-bigquery-0.28.0 vs lib/google/cloud/bigquery/project.rb in google-cloud-bigquery-0.29.0

- line present only in google-cloud-bigquery-0.28.0 (old)
+ line present only in google-cloud-bigquery-0.29.0 (new)

@@ -17,11 +17,11 @@ require "google/cloud/errors" require "google/cloud/bigquery/service" require "google/cloud/bigquery/credentials" require "google/cloud/bigquery/dataset" require "google/cloud/bigquery/job" -require "google/cloud/bigquery/query_data" +require "google/cloud/bigquery/external" require "google/cloud/bigquery/project/list" require "google/cloud/bigquery/time" require "google/cloud/bigquery/schema" module Google @@ -126,10 +126,14 @@ # or named (`@myparam`) query parameters. If value passed is an array # `["foo"]`, the query must use positional query parameters. If value # passed is a hash `{ myparam: "foo" }`, the query must use named # query parameters. When set, `legacy_sql` will automatically be set # to false and `standard_sql` to true. + # @param [Hash<String|Symbol, External::DataSource>] external A Hash + # that represents the mapping of the external tables to the table + # names used in the SQL query. The hash keys are the table names, and + # the hash values are the external table objects. See {Project#query}. # @param [String] priority Specifies a priority for the query. Possible # values include `INTERACTIVE` and `BATCH`. The default value is # `INTERACTIVE`. # @param [Boolean] cache Whether to look for the result in the query # cache. The query cache is a best-effort cache that will be flushed @@ -156,13 +160,13 @@ # * `append` - BigQuery appends the data to the table. # * `empty` - A 'duplicate' error is returned in the job result if the # table exists and contains data. # @param [Dataset, String] dataset The default dataset to use for # unqualified table names in the query. Optional. - # @param [Boolean] large_results If `true`, allows the query to produce - # arbitrarily large result tables at a slight cost in performance. - # Requires `table` parameter to be set. + # @param [String] project Specifies the default projectId to assume for + # any unqualified table names in the query. Only used if `dataset` + # option is set. # @param [Boolean] standard_sql Specifies whether to use BigQuery's # [standard # SQL](https://cloud.google.com/bigquery/docs/reference/standard-sql/) # dialect for this query. If set to true, the query will use standard # SQL rather than the [legacy @@ -190,11 +194,43 @@ # queries](https://cloud.google.com/bigquery/pricing#high-compute). # @param [Integer] maximum_bytes_billed Limits the bytes billed for this # job. Queries that will have bytes billed beyond this limit will fail # (without incurring a charge). Optional. If unspecified, this will be # set to your project default. + # @param [String] job_id A user-defined ID for the query job. The ID + # must contain only letters (a-z, A-Z), numbers (0-9), underscores + # (_), or dashes (-). The maximum length is 1,024 characters. If + # `job_id` is provided, then `prefix` will not be used. # + # See [Generating a job + # ID](https://cloud.google.com/bigquery/docs/managing-jobs#generate-jobid). + # @param [String] prefix A string, usually human-readable, that will be + # prepended to a generated value to produce a unique job ID. For + # example, the prefix `daily_import_job_` can be given to generate a + # job ID such as `daily_import_job_12vEDtMQ0mbp1Mo5Z7mzAFQJZazh`. The + # prefix must contain only letters (a-z, A-Z), numbers (0-9), + # underscores (_), or dashes (-). The maximum length of the entire ID + # is 1,024 characters. If `job_id` is provided, then `prefix` will not + # be used. 
+ # + # See [Generating a job + # ID](https://cloud.google.com/bigquery/docs/managing-jobs#generate-jobid). + # @param [Hash] labels A hash of user-provided labels associated with + # the job. You can use these to organize and group your jobs. Label + # keys and values can be no longer than 63 characters, can only + # contain lowercase letters, numeric characters, underscores and + # dashes. International characters are allowed. Label values are + # optional. Label keys must start with a letter and each label in the + # list must have a different key. + # @param [Array<String>, String] udfs User-defined function resources + # used in the query. May be either a code resource to load from a + # Google Cloud Storage URI (`gs://bucket/path`), or an inline resource + # that contains code for a user-defined function (UDF). Providing an + # inline code resource is equivalent to providing a URI for a file + # containing the same code. See [User-Defined + # Functions](https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions). + # # @return [Google::Cloud::Bigquery::QueryJob] # # @example Query using standard SQL: # require "google/cloud/bigquery" # @@ -203,11 +239,11 @@ # job = bigquery.query_job "SELECT name FROM " \ # "`my_project.my_dataset.my_table`" # # job.wait_until_done! # if !job.failed? - # job.query_results.each do |row| + # job.data.each do |row| # puts row[:name] # end # end # # @example Query using legacy SQL: @@ -219,11 +255,11 @@ # "[my_project:my_dataset.my_table]", # legacy_sql: true # # job.wait_until_done! # if !job.failed? - # job.query_results.each do |row| + # job.data.each do |row| # puts row[:name] # end # end # # @example Query using positional query parameters: @@ -236,11 +272,11 @@ # " WHERE id = ?", # params: [1] # # job.wait_until_done! # if !job.failed? - # job.query_results.each do |row| + # job.data.each do |row| # puts row[:name] # end # end # # @example Query using named query parameters: @@ -253,36 +289,62 @@ # " WHERE id = @id", # params: { id: 1 } # # job.wait_until_done! # if !job.failed? - # job.query_results.each do |row| + # job.data.each do |row| # puts row[:name] # end # end # - def query_job query, params: nil, priority: "INTERACTIVE", cache: true, - table: nil, create: nil, write: nil, dataset: nil, + # @example Query using external data source: + # require "google/cloud/bigquery" + # + # bigquery = Google::Cloud::Bigquery.new + # + # csv_url = "gs://bucket/path/to/data.csv" + # csv_table = bigquery.external csv_url do |csv| + # csv.autodetect = true + # csv.skip_leading_rows = 1 + # end + # + # job = bigquery.query_job "SELECT * FROM my_ext_table", + # external: { my_ext_table: csv_table } + # + # job.wait_until_done! + # if !job.failed? + # job.data.each do |row| + # puts row[:name] + # end + # end + # + def query_job query, params: nil, external: nil, + priority: "INTERACTIVE", cache: true, table: nil, + create: nil, write: nil, dataset: nil, project: nil, standard_sql: nil, legacy_sql: nil, large_results: nil, flatten: nil, maximum_billing_tier: nil, - maximum_bytes_billed: nil + maximum_bytes_billed: nil, job_id: nil, prefix: nil, + labels: nil, udfs: nil ensure_service! 
options = { priority: priority, cache: cache, table: table, create: create, write: write, large_results: large_results, flatten: flatten, - dataset: dataset, legacy_sql: legacy_sql, - standard_sql: standard_sql, + dataset: dataset, project: project, + legacy_sql: legacy_sql, standard_sql: standard_sql, maximum_billing_tier: maximum_billing_tier, maximum_bytes_billed: maximum_bytes_billed, - params: params } + params: params, external: external, labels: labels, + job_id: job_id, prefix: prefix, udfs: udfs } gapi = service.query_job query, options Job.from_gapi gapi, service end ## - # Queries data using the [synchronous - # method](https://cloud.google.com/bigquery/querying-data). + # Queries data using a synchronous method that blocks for a response. In + # this method, a {QueryJob} is created and its results are saved + # to a temporary table, then read from the table. Timeouts and transient + # errors are generally handled as needed to complete the query. # # When using standard SQL and passing arguments using `params`, Ruby # types are mapped to BigQuery types as follows: # # | BigQuery | Ruby | Notes | @@ -300,10 +362,12 @@ # | `STRUCT` | `Hash` | Hash keys may be strings or symbols. | # # See [Data Types](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types) # for an overview of each BigQuery data type, including allowed values. # + # @see https://cloud.google.com/bigquery/querying-data Querying Data + # # @param [String] query A query string, following the BigQuery [query # syntax](https://cloud.google.com/bigquery/query-reference), of the # query to execute. Example: "SELECT count(f1) FROM # [myProjectId:myDatasetId.myTableId]". # @param [Array, Hash] params Standard SQL only. Used to pass query @@ -311,26 +375,20 @@ # or named (`@myparam`) query parameters. If value passed is an array # `["foo"]`, the query must use positional query parameters. If value # passed is a hash `{ myparam: "foo" }`, the query must use named # query parameters. When set, `legacy_sql` will automatically be set # to false and `standard_sql` to true. + # @param [Hash<String|Symbol, External::DataSource>] external A Hash + # that represents the mapping of the external tables to the table + # names used in the SQL query. The hash keys are the table names, and + # the hash values are the external table objects. See {Project#query}. # @param [Integer] max The maximum number of rows of data to return per # page of results. Setting this flag to a small value such as 1000 and # then paging through results might improve reliability when the query # result set is large. In addition to this limit, responses are also # limited to 10 MB. By default, there is no maximum row count, and # only the byte limit applies. - # @param [Integer] timeout How long to wait for the query to complete, - # in milliseconds, before the request times out and returns. Note that - # this is only a timeout for the request, not the query. If the query - # takes longer to run than the timeout value, the call returns without - # any results and with QueryData#complete? set to false. The default - # value is 10000 milliseconds (10 seconds). - # @param [Boolean] dryrun If set to `true`, BigQuery doesn't run the - # job. Instead, if the query is valid, BigQuery returns statistics - # about the job such as how many bytes would be processed. If the - # query is invalid, an error returns. The default value is `false`. # @param [Boolean] cache Whether to look for the result in the query # cache. 
The query cache is a best-effort cache that will be flushed # whenever tables in the query are modified. The default value is # true. For more information, see [query # caching](https://developers.google.com/bigquery/querying-data). @@ -359,11 +417,11 @@ # SQL](https://cloud.google.com/bigquery/docs/reference/standard-sql/) # When set to false, the values of `large_results` and `flatten` are # ignored; the query will be run as if `large_results` is true and # `flatten` is false. Optional. The default value is false. # - # @return [Google::Cloud::Bigquery::QueryData] + # @return [Google::Cloud::Bigquery::Data] # # @example Query using standard SQL: # require "google/cloud/bigquery" # # bigquery = Google::Cloud::Bigquery.new @@ -385,11 +443,11 @@ # # data.each do |row| # puts row[:name] # end # - # @example Retrieve all rows: (See {QueryData#all}) + # @example Retrieve all rows: (See {Data#all}) # require "google/cloud/bigquery" # # bigquery = Google::Cloud::Bigquery.new # # data = bigquery.query "SELECT name FROM `my_dataset.my_table`" @@ -424,23 +482,103 @@ # # data.each do |row| # puts row[:name] # end # - def query query, params: nil, max: nil, timeout: 10000, dryrun: nil, - cache: true, dataset: nil, project: nil, standard_sql: nil, - legacy_sql: nil + # @example Query using external data source: + # require "google/cloud/bigquery" + # + # bigquery = Google::Cloud::Bigquery.new + # + # csv_url = "gs://bucket/path/to/data.csv" + # csv_table = bigquery.external csv_url do |csv| + # csv.autodetect = true + # csv.skip_leading_rows = 1 + # end + # + # data = bigquery.query "SELECT * FROM my_ext_table", + # external: { my_ext_table: csv_table } + # + # data.each do |row| + # puts row[:name] + # end + # + def query query, params: nil, external: nil, max: nil, cache: true, + dataset: nil, project: nil, standard_sql: nil, legacy_sql: nil ensure_service! - options = { max: max, timeout: timeout, dryrun: dryrun, cache: cache, - dataset: dataset, project: project, + options = { cache: cache, dataset: dataset, project: project, legacy_sql: legacy_sql, standard_sql: standard_sql, - params: params } - gapi = service.query query, options - QueryData.from_gapi gapi, service + params: params, external: external } + + job = query_job query, options + job.wait_until_done! + + if job.failed? + begin + # raise to activate ruby exception cause handling + fail job.gapi_error + rescue => e + # wrap Google::Apis::Error with Google::Cloud::Error + raise Google::Cloud::Error.from_error(e) + end + end + + job.data max: max end ## + # Creates a new External::DataSource (or subclass) object that + # represents the external data source that can be queried from directly, + # even though the data is not stored in BigQuery. Instead of loading or + # streaming the data, this object references the external data source. + # + # @see https://cloud.google.com/bigquery/external-data-sources Querying + # External Data Sources + # + # @param [String, Array<String>] url The fully-qualified URL(s) that + # point to your data in Google Cloud. An attempt will be made to + # derive the format from the URLs provided. + # @param [String|Symbol] format The data format. This value will be used + # even if the provided URLs are recognized as a different format. + # Optional. 
+ # + # The following values are supported: + # + # * `csv` - CSV + # * `json` - [Newline-delimited JSON](http://jsonlines.org/) + # * `avro` - [Avro](http://avro.apache.org/) + # * `sheets` - Google Sheets + # * `datastore_backup` - Cloud Datastore backup + # * `bigtable` - Bigtable + # + # @return [External::DataSource] External data source. + # + # @example + # require "google/cloud/bigquery" + # + # bigquery = Google::Cloud::Bigquery.new + # + # csv_url = "gs://bucket/path/to/data.csv" + # csv_table = bigquery.external csv_url do |csv| + # csv.autodetect = true + # csv.skip_leading_rows = 1 + # end + # + # data = bigquery.query "SELECT * FROM my_ext_table", + # external: { my_ext_table: csv_table } + # + # data.each do |row| + # puts row[:name] + # end + # + def external url, format: nil + ext = External.from_urls url, format + yield ext if block_given? + ext + end + + ## # Retrieves an existing dataset by ID. # # @param [String] dataset_id The ID of a dataset. # # @return [Google::Cloud::Bigquery::Dataset, nil] Returns `nil` if the @@ -537,10 +675,15 @@ ## # Retrieves the list of datasets belonging to the project. # # @param [Boolean] all Whether to list all datasets, including hidden # ones. The default is `false`. + # @param [String] filter An expression for filtering the results of the + # request by label. The syntax is `labels.<name>[:<value>]`. + # Multiple filters can be `AND`ed together by connecting with a space. + # Example: `labels.department:receiving labels.active`. See [Filtering + # datasets using labels](https://cloud.google.com/bigquery/docs/labeling-datasets#filtering_datasets_using_labels). # @param [String] token A previously-returned page token representing # part of the larger set of results to view. # @param [Integer] max Maximum number of datasets to return. # # @return [Array<Google::Cloud::Bigquery::Dataset>] (See @@ -571,15 +714,15 @@ # datasets = bigquery.datasets # datasets.all do |dataset| # puts dataset.name # end # - def datasets all: nil, token: nil, max: nil + def datasets all: nil, filter: nil, token: nil, max: nil ensure_service! - options = { all: all, token: token, max: max } + options = { all: all, filter: filter, token: token, max: max } gapi = service.list_datasets options - Dataset::List.from_gapi gapi, service, all, max + Dataset::List.from_gapi gapi, service, all, filter, max end ## # Retrieves an existing job by ID. # @@ -755,14 +898,14 @@ ## # Creates a new schema instance. An optional block may be given to # configure the schema, otherwise the schema is returned empty and may # be configured directly. # - # The returned schema can be passed to {Dataset#load} using the `schema` - # option. However, for most use cases, the block yielded by - # {Dataset#load} is a more convenient way to configure the schema for - # the destination table. + # The returned schema can be passed to {Dataset#load} using the + # `schema` option. However, for most use cases, the block yielded by + # {Dataset#load} is a more convenient way to configure the schema + # for the destination table. # # @yield [schema] a block for setting the schema # @yieldparam [Schema] schema the object accepting the schema # # @return [Google::Cloud::Bigquery::Schema] @@ -781,10 +924,10 @@ # end # # dataset = bigquery.dataset "my_dataset" # # gs_url = "gs://my-bucket/file-name.csv" - # load_job = dataset.load "my_new_table", gs_url, schema: schema + # load_job = dataset.load_job "my_new_table", gs_url, schema: schema # def schema s = Schema.from_gapi yield s if block_given? s
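The `query_job` hunks above add `external`, `job_id`/`prefix`, `labels`, and `udfs` options and switch the examples from `job.query_results` to `job.data`. Below is a minimal sketch of an asynchronous query using a couple of the new options; the project, dataset, table, and label names are placeholders, not values taken from this diff.

  require "google/cloud/bigquery"

  bigquery = Google::Cloud::Bigquery.new

  # `prefix` produces a generated job ID such as
  # "nightly_report_12vEDtMQ0mbp1Mo5Z7mzAFQJZazh"; `labels` tags the job.
  job = bigquery.query_job "SELECT name FROM `my_project.my_dataset.my_table`",
                           prefix: "nightly_report_",
                           labels: { "team" => "analytics" }

  job.wait_until_done!
  unless job.failed?
    # 0.29.0 reads results with Job#data instead of Job#query_results.
    job.data.each do |row|
      puts row[:name]
    end
  end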
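The synchronous `query` method is rewritten to create a `QueryJob`, block until it finishes, and read results from the job; on failure it raises a `Google::Cloud::Error` wrapping the underlying API error, and it returns `Data` rather than the removed `QueryData` (the `timeout` and `dryrun` options are gone, while `max` is applied when reading the results). A sketch of how a caller might handle that, with a placeholder table name:

  require "google/cloud/bigquery"

  bigquery = Google::Cloud::Bigquery.new

  begin
    # Returns Google::Cloud::Bigquery::Data; max: caps rows per result page.
    data = bigquery.query "SELECT name FROM `my_dataset.my_table`", max: 100
    data.each { |row| puts row[:name] }
  rescue Google::Cloud::Error => e
    # A failed query job now surfaces here as a wrapped error.
    warn "Query failed: #{e.message}"
  end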
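The new `Project#external` helper wraps `External.from_urls`, yields the data source for configuration, and accepts a `format` override for URLs whose format cannot be derived. A sketch assuming a CSV object whose name has no extension; the bucket, object, and table names are made up.

  require "google/cloud/bigquery"

  bigquery = Google::Cloud::Bigquery.new

  # No file extension on the object, so the format is given explicitly.
  csv_url = "gs://my-bucket/exports/products_2017_10"
  csv_table = bigquery.external csv_url, format: :csv do |csv|
    csv.autodetect = true
    csv.skip_leading_rows = 1
  end

  data = bigquery.query "SELECT * FROM products",
                        external: { products: csv_table }
  data.each { |row| puts row }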
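`Project#datasets` gains a `filter` option using the `labels.<name>[:<value>]` syntax documented above, with multiple filters joined by a space. A short sketch; the label names are examples only.

  require "google/cloud/bigquery"

  bigquery = Google::Cloud::Bigquery.new

  # Datasets labeled department=receiving that also carry an "active" key.
  datasets = bigquery.datasets filter: "labels.department:receiving labels.active"
  datasets.all do |dataset|
    puts dataset.name
  end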
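Finally, the `schema` helper's example now pairs with `Dataset#load_job`, the renamed asynchronous load. The sketch below fills in the schema block that the hunk elides; the field names, dataset, table, and bucket path are placeholders, and `string`/`integer` are assumed to be the usual Schema field methods.

  require "google/cloud/bigquery"

  bigquery = Google::Cloud::Bigquery.new

  # Build the schema up front and pass it to the load job.
  schema = bigquery.schema do |s|
    s.string  "first_name", mode: :required
    s.integer "age"
  end

  dataset = bigquery.dataset "my_dataset"

  gs_url   = "gs://my-bucket/file-name.csv"
  load_job = dataset.load_job "my_new_table", gs_url, schema: schema
  load_job.wait_until_done!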