lib/subber/parser/vtt.rb in subber-0.1.3 vs lib/subber/parser/vtt.rb in subber-0.1.4
- old
+ new
@@ -1,70 +1,75 @@
module Subber::Parser
class Vtt < Base
- SUBTITLE_REGEX = /([^\n]*)\n([^\n]*)(\n(.*))?/m
+ SUBTITLE_REGEX = /(\d*)\n?(^\d{0,2}:?\d{2}:\d{2}\.\d{3}\s-->\s\d{0,2}:?\d{2}:\d{2}\.\d{3}$)\n?(.*)/m
COUNTER_REGEX = /\d+/
- TIME_RANGE_REGEX = /(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})/
- TIMECODE_REGEX = /(\d{2}):(\d{2}):(\d{2})\.(\d{3})/
+ TIME_RANGE_REGEX = /(^\d{0,2}:?\d{2}:\d{2}\.\d{3})\s-->\s(\d{0,2}:?\d{2}:\d{2}\.\d{3}$)/
+ TIMECODE_REGEX = /(^\d{0,2}):?(\d{2}):(\d{2})\.(\d{3})/
- DELIMITER_REGEX = /\n?\n\n/
+ CUE_DELIMITER_REGEX = /\n\n/
WINDOW_LINE_BREAK_REGEX = /\r/
- WEBVTT_HEADER_REGEX = /WEBVTT\n\n/
BYTE_ORDER_MARK_STRING = "\xEF\xBB\xBF"
+ INVALID_CUE_START_STRINGS = %w(WEBVTT NOTE STYLE REGION)
class << self
# @param file_content [String]
# @return [Array<Subber::Subtitle>]
#
def parse(file_content)
file_content = remove_window_line_break(file_content)
- file_content = remove_webvtt_header(file_content)
+ cues = extract_cues(file_content)
- subtitle_texts = file_content.split(DELIMITER_REGEX)
- subtitle_texts.map do |subtitle_text|
- convert_text_to_subtitle(subtitle_text)
+ cues.map.with_index do |cue, index|
+ convert_cue_to_subtitle(cue, index)
end
end
private
# @param file_content [String]
- # @return [String]
+ # @return [Array<String>]
#
- def remove_webvtt_header(file_content)
- file_content.sub(WEBVTT_HEADER_REGEX, '')
+ def extract_cues(file_content)
+ cues = file_content.split(CUE_DELIMITER_REGEX)
+ cues.reject do |cue|
+ cue.start_with?(*INVALID_CUE_START_STRINGS)
+ end
end
- # @param file_content [String]
- # @return [String]
+ # @param cue [String]
+ # @param index [Integer]
+ # @return [Array<Subber::Subtitle>]
#
- def remove_window_line_break(file_content)
- file_content.gsub(WINDOW_LINE_BREAK_REGEX, '')
- end
+ def convert_cue_to_subtitle(cue, index)
+ matches = cue.match(SUBTITLE_REGEX).to_a
+ raise(Subber::Errors::InvalidVttFormat, cue) if matches.empty?
- # @param subtitle_text [String]
- # @return [Subber::Subtitle]
- #
- def convert_text_to_subtitle(subtitle_text)
- matches = subtitle_text.match(SUBTITLE_REGEX).to_a
- raise(Subber::Errors::InvalidSrtFormat, subtitle_text) if matches.empty?
+ _cue, counter, time_range_string, content = matches
- _subtitle_text, counter, time_range_string, _new_line, content = matches
+ counter = (index + 1).to_s if counter.empty?
counter = extract_counter(counter)
from, to = extract_time_range(time_range_string)
Subber::Subtitle.new(
counter: counter,
start_time: convert_time_to_ms(from),
end_time: convert_time_to_ms(to),
- content: content
+ content: content.strip
)
rescue Subber::Errors::InvalidCounter
- raise(Subber::Errors::InvalidCounter, subtitle_text)
+ raise(Subber::Errors::InvalidCounter, cue)
rescue Subber::Errors::InvalidTimeRange
- raise(Subber::Errors::InvalidTimeRange, subtitle_text)
+ raise(Subber::Errors::InvalidTimeRange, cue)
rescue Subber::Errors::InvalidTimestamp
- raise(Subber::Errors::InvalidTimestamp, subtitle_text)
+ raise(Subber::Errors::InvalidTimestamp, cue)
+ end
+
+ # @param file_content [String]
+ # @return [String]
+ #
+ def remove_window_line_break(file_content)
+ file_content.gsub(WINDOW_LINE_BREAK_REGEX, '')
end
# @param counter_string [String]
# @return [Integer]
# @raise [Subber::Errors::InvalidCounter]