cloud_speech.rb in google-cloud-speech-0.30.1

- old
+ new

@@ -13,21 +13,10 @@
 # limitations under the License.
 
 module Google
   module Cloud
     module Speech
-      ##
-      # # Cloud Speech API Contents
-      #
-      # | Class | Description |
-      # | ----- | ----------- |
-      # | [SpeechClient][] | Service that implements Google Cloud Speech API. |
-      # | [Data Types][] | Data types for Google::Cloud::Speech::V1p1beta1 |
-      #
-      # [SpeechClient]: https://googlecloudplatform.github.io/google-cloud-ruby/#/docs/google-cloud-speech/latest/google/cloud/speech/v1p1beta1/speechclient
-      # [Data Types]: https://googlecloudplatform.github.io/google-cloud-ruby/#/docs/google-cloud-speech/latest/google/cloud/speech/v1p1beta1/datatypes
-      #
       module V1p1beta1
         # The top-level message sent by the client for the +Recognize+ method.
         # @!attribute [rw] config
         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionConfig]
         #     *Required* Provides information to the recognizer that specifies how to
@@ -111,17 +100,50 @@
         #     16000 is optimal. For best results, set the sampling rate of the audio
         #     source to 16000 Hz. If that's not possible, use the native sample rate of
         #     the audio source (instead of re-sampling).
         #     This field is optional for +FLAC+ and +WAV+ audio files and required
         #     for all other audio formats. For details, see {Google::Cloud::Speech::V1p1beta1::RecognitionConfig::AudioEncoding AudioEncoding}.
+        # @!attribute [rw] audio_channel_count
+        #   @return [Integer]
+        #     *Optional* The number of channels in the input audio data.
+        #     ONLY set this for MULTI-CHANNEL recognition.
+        #     Valid values for LINEAR16 and FLAC are +1+-+8+.
+        #     Valid values for OGG_OPUS are +1+-+254+.
+        #     Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only +1+.
+        #     If +0+ or omitted, defaults to one channel (mono).
+        #     NOTE: We only recognize the first channel by default.
+        #     To perform independent recognition on each channel, set
+        #     enable_separate_recognition_per_channel to 'true'.
+        # @!attribute [rw] enable_separate_recognition_per_channel
+        #   @return [true, false]
+        #     This needs to be set to 'true' explicitly and audio_channel_count > 1
+        #     to get each channel recognized separately. The recognition result will
+        #     contain a channel_tag field to state which channel that result belongs to.
+        #     If this is not 'true', we will only recognize the first channel.
+        #     NOTE: The request is also billed cumulatively for all channels recognized:
+        #         (audio_channel_count times the audio length)
         # @!attribute [rw] language_code
         #   @return [String]
         #     *Required* The language of the supplied audio as a
         #     [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
         #     Example: "en-US".
         #     See [Language Support](https://cloud.google.com/speech/docs/languages)
         #     for a list of the currently supported language codes.
+        # @!attribute [rw] alternative_language_codes
+        #   @return [Array<String>]
+        #     *Optional* A list of up to 3 additional
+        #     [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
+        #     listing possible alternative languages of the supplied audio.
+        #     See [Language Support](https://cloud.google.com/speech/docs/languages)
+        #     for a list of the currently supported language codes.
+        #     If alternative languages are listed, recognition result will contain
+        #     recognition in the most likely language detected including the main
+        #     language_code. The recognition result will include the language tag
+        #     of the language detected in the audio.
+        #     NOTE: This feature is only supported for Voice Command and Voice Search
+        #     use cases and performance may vary for other use cases (e.g., phone call
+        #     transcription).
         # @!attribute [rw] max_alternatives
         #   @return [Integer]
         #     *Optional* Maximum number of recognition hypotheses to be returned.
         #     Specifically, the maximum number of +SpeechRecognitionAlternative+ messages
         #     within each +SpeechRecognitionResult+.
@@ -141,19 +163,39 @@
         #   @return [true, false]
         #     *Optional* If +true+, the top result includes a list of words and
         #     the start and end time offsets (timestamps) for those words. If
         #     +false+, no word-level time offset information is returned. The default is
         #     +false+.
+        # @!attribute [rw] enable_word_confidence
+        #   @return [true, false]
+        #     *Optional* If +true+, the top result includes a list of words and the
+        #     confidence for those words. If +false+, no word-level confidence
+        #     information is returned. The default is +false+.
         # @!attribute [rw] enable_automatic_punctuation
         #   @return [true, false]
         #     *Optional* If 'true', adds punctuation to recognition result hypotheses.
         #     This feature is only available in select languages. Setting this for
         #     requests in other languages has no effect at all.
         #     The default 'false' value does not add punctuation to result hypotheses.
         #     NOTE: "This is currently offered as an experimental service, complimentary
         #     to all users. In the future this may be exclusively available as a
         #     premium feature."
+        # @!attribute [rw] enable_speaker_diarization
+        #   @return [true, false]
+        #     *Optional* If 'true', enables speaker detection for each recognized word in
+        #     the top alternative of the recognition result using a speaker_tag provided
+        #     in the WordInfo.
+        #     Note: When this is true, we send all the words from the beginning of the
+        #     audio for the top alternative in every consecutive response.
+        #     This is done in order to improve our speaker tags as our models learn to
+        #     identify the speakers in the conversation over time.
+        # @!attribute [rw] diarization_speaker_count
+        #   @return [Integer]
+        #     *Optional*
+        #     If set, specifies the estimated number of speakers in the conversation.
+        #     If not set, defaults to '2'.
+        #     Ignored unless enable_speaker_diarization is set to true.
         # @!attribute [rw] metadata
         #   @return [Google::Cloud::Speech::V1p1beta1::RecognitionMetadata]
         #     *Optional* Metadata regarding this request.
         # @!attribute [rw] model
         #   @return [String]
@@ -565,19 +607,41 @@
         #     Output only. An estimate of the likelihood that the recognizer will not
         #     change its guess about this interim result. Values range from 0.0
         #     (completely unstable) to 1.0 (completely stable).
         #     This field is only provided for interim results (+is_final=false+).
         #     The default of 0.0 is a sentinel value indicating +stability+ was not set.
+        # @!attribute [rw] channel_tag
+        #   @return [Integer]
+        #     For multi-channel audio, this is the channel number corresponding to the
+        #     recognized result for the audio from that channel.
+        #     For audio_channel_count = N, its output values can range from '1' to 'N'.
+        # @!attribute [rw] language_code
+        #   @return [String]
+        #     Output only. The
+        #     [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
+        #     language in this result. This language code was detected as the most
+        #     likely to have been spoken in the audio.
         class StreamingRecognitionResult; end
 
         # A speech recognition result corresponding to a portion of the audio.
         # @!attribute [rw] alternatives
         #   @return [Array<Google::Cloud::Speech::V1p1beta1::SpeechRecognitionAlternative>]
         #     Output only. May contain one or more recognition hypotheses (up to the
         #     maximum specified in +max_alternatives+).
         #     These alternatives are ordered in terms of accuracy, with the top (first)
         #     alternative being the most probable, as ranked by the recognizer.
+        # @!attribute [rw] channel_tag
+        #   @return [Integer]
+        #     For multi-channel audio, this is the channel number corresponding to the
+        #     recognized result for the audio from that channel.
+        #     For audio_channel_count = N, its output values can range from '1' to 'N'.
+        # @!attribute [rw] language_code
+        #   @return [String]
+        #     Output only. The
+        #     [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
+        #     language in this result. This language code was detected as the most
+        #     likely to have been spoken in the audio.
         class SpeechRecognitionResult; end
 
         # Alternative hypotheses (a.k.a. n-best list).
         # @!attribute [rw] transcript
         #   @return [String]
@@ -616,9 +680,25 @@
         #     This is an experimental feature and the accuracy of the time offset can
         #     vary.
         # @!attribute [rw] word
         #   @return [String]
         #     Output only. The word corresponding to this set of information.
+        # @!attribute [rw] confidence
+        #   @return [Float]
+        #     Output only. The confidence estimate between 0.0 and 1.0. A higher number
+        #     indicates an estimated greater likelihood that the recognized words are
+        #     correct. This field is set only for the top alternative of a non-streaming
+        #     result, or of a streaming result where +is_final=true+.
+        #     This field is not guaranteed to be accurate and users should not rely on it
+        #     to be always provided.
+        #     The default of 0.0 is a sentinel value indicating +confidence+ was not set.
+        # @!attribute [rw] speaker_tag
+        #   @return [Integer]
+        #     Output only. A distinct integer value is assigned for every speaker within
+        #     the audio. This field specifies which one of those speakers was detected to
+        #     have spoken this word. Value ranges from '1' to diarization_speaker_count.
+        #     speaker_tag is set if enable_speaker_diarization = 'true' and only in the
+        #     top alternative.
         class WordInfo; end
       end
     end
   end
 end
\ No newline at end of file