#
# warning: somewhat scary/explicit!
# see subtitle_todo file
require_relative 'edl_parser'

# Scans SRT-style subtitle text for profanities and produces EDL-style
# entries ("begin" , "end", "profanity", "word", "sanitized text") for every
# subtitle entry that contains one.
module SubtitleProfanityFinder

  # Splits subtitle text into "glops": chunks that start at the first
  # timestamp of an entry (so the timing line is the first line of the chunk)
  # and run up to the blank line that terminates the entry.
  #
  # The final entry is also returned when the text does not end with a blank
  # line; `^` never matches at the very end of a string, so `\z` is accepted
  # as a terminator as well.
  #
  # @param subtitles [String] subtitle text using "\n" line endings
  # @return [Array<String>] one string per subtitle entry
  def self.split_to_glops(subtitles)
    subtitles.scan(/\d\d:\d\d:\d\d.*?(?:^$|\z)/m)
  end

  # Turns a profanity => replacement mapping into an ordered list of
  # [Regexp, replacement] pairs.
  #
  # A replacement is either a String (the profanity may match anywhere, even
  # inside a longer word) or a two element Array [String, :full_word] (the
  # profanity only matches when delimited by spaces and/or line boundaries).
  # Every profanity is also tried with 'l' swapped for 'i' and 'i' swapped for
  # 'l'. Replacements are wrapped in square brackets.
  #
  # Entries are processed in reverse sorted order, so a longer word is listed
  # before a shorter word that is its prefix.
  #
  # @param profanity_hash [Hash{String => String, Array}]
  # @return [Array<Array(Regexp, String)>]
  # @raise [RuntimeError] if an Array replacement is not [String, :full_word]
  def self.convert_to_regexps(profanity_hash)
    all_profanity_combinations = []
    profanity_hash.sort.reverse_each do |profanity, sanitized|
      is_single_word_profanity = false
      if sanitized.is_a? Array
        raise "unknown option #{sanitized[1].inspect} (only :full_word is supported)" unless sanitized[1] == :full_word
        raise 'expected exactly [replacement, :full_word]' unless sanitized.length == 2
        is_single_word_profanity = true
        sanitized = sanitized[0]
      end

      permutations = [profanity]
      permutations << profanity.gsub(/l/i, 'i') if profanity.include?('l')
      permutations << profanity.gsub(/i/i, 'l') if profanity.include?('i')

      bracketized = '[' + sanitized + ']'
      permutations.each do |permutation|
        if is_single_word_profanity
          # four variants: space on both sides, at line start, at line end,
          # and alone on a line; the replacement keeps the matched spaces
          all_profanity_combinations << [Regexp.new(" #{permutation} ", Regexp::IGNORECASE), ' ' + bracketized + ' ']
          all_profanity_combinations << [Regexp.new("^#{permutation} ", Regexp::IGNORECASE), bracketized + ' ']
          all_profanity_combinations << [Regexp.new(" #{permutation}$", Regexp::IGNORECASE), ' ' + bracketized]
          all_profanity_combinations << [Regexp.new("^#{permutation}$", Regexp::IGNORECASE), bracketized]
        else
          # build the regexp from the permutation itself, otherwise the l/i
          # variants would never be matched
          all_profanity_combinations << [Regexp.new(permutation, Regexp::IGNORECASE), bracketized]
        end
      end
    end
    all_profanity_combinations
  end

  # Reads a subtitle file and returns the EDL entries for it; see
  # edl_output_from_string for the meaning of the parameters and the result.
  #
  # @param incoming_filename [String] path of the subtitle file to read
  # @return [String]
  def self.edl_output(incoming_filename, extra_profanity_hash = {}, subtract_from_each_beginning_ts = 0, add_to_end_each_ts = 0, beginning_srt = "00:00", beginning_actual_movie = "00:00", ending_srt = "10:00:00", ending_actual = "10:00:00")
    edl_output_from_string(File.read(incoming_filename), extra_profanity_hash, subtract_from_each_beginning_ts, add_to_end_each_ts, beginning_srt, beginning_actual_movie, ending_srt, ending_actual)
  end

  # Builds the EDL entries for subtitle text.
  #
  # The result holds two sections, each introduced by a "\n": first the
  # entries found with the built-in "bad" list (merged with
  # extra_profanity_hash), then those found with the "semi bad" list. A
  # subtitle entry is reported at most once: it is skipped when its begin
  # timestamp already appears in the output built so far.
  #
  # The four timestamp strings are handed to
  # EdlParser.translate_string_to_seconds (defined in edl_parser, not shown
  # here; presumably it parses "HH:MM:SS"-style strings into seconds).
  #
  # @param subtitles [String] subtitle text; it is not modified
  # @param extra_profanity_hash [Hash] extra profanity => replacement entries,
  #   same format as accepted by convert_to_regexps
  # @param subtract_from_each_beginning_ts [Numeric] subtracted from every
  #   begin time, must not be negative
  # @param add_to_end_each_ts [Numeric] added to every end time, must not be
  #   negative
  # @param starting_timestamp_given_srt [String] a reference point in subtitle time
  # @param starting_timestamp_actual [String] the same point in movie time
  # @param ending_srt [String] a second reference point in subtitle time
  # @param ending_actual [String] the second point in movie time
  # @return [String]
  # @raise [RuntimeError] if either adjustment is negative
  def self.edl_output_from_string(subtitles, extra_profanity_hash, subtract_from_each_beginning_ts, add_to_end_each_ts, starting_timestamp_given_srt, starting_timestamp_actual, ending_srt, ending_actual)
    # normalize line endings on a copy so the caller's string is left alone
    # (and frozen strings are accepted)
    subtitles = subtitles.gsub("\r\n", "\n")
    # these have to be positive...in my twisted paradigm
    raise 'subtract_from_each_beginning_ts must not be negative' if subtract_from_each_beginning_ts < 0
    raise 'add_to_end_each_ts must not be negative' if add_to_end_each_ts < 0

    starting_timestamp_given_srt = EdlParser.translate_string_to_seconds(starting_timestamp_given_srt)
    starting_timestamp_actual = EdlParser.translate_string_to_seconds(starting_timestamp_actual)
    ending_srt = EdlParser.translate_string_to_seconds(ending_srt)
    ending_actual = EdlParser.translate_string_to_seconds(ending_actual)

    # accomodate for both styles of rewrite, except it messes up the math, so just leave it separate:
    # difference = starting_timestamp_given_srt - starting_timestamp_actual
    # subtract_from_each_beginning_ts += difference
    # add_to_end_each_ts -= difference

    # linear mapping from subtitle time to movie time:
    # you minus the initial srt time... (given)
    # ratio = (end actual - init actual / end given - init given) * (how far you are past the initial srt) plus initial actual
    multiply_by_this_factor = (ending_actual - starting_timestamp_actual) / (ending_srt - starting_timestamp_given_srt)
    multiply_proc = proc { |you|
      ((you - starting_timestamp_given_srt) * multiply_by_this_factor) + starting_timestamp_actual
    }

    all_profanity_combinationss = [
      convert_to_regexps(bad_profanities.merge(extra_profanity_hash)), # LODO make easier to use...
      convert_to_regexps(semi_bad_profanities)
    ]
    glops = split_to_glops(subtitles)

    output = ''
    all_profanity_combinationss.each do |all_profanity_combinations|
      output << "\n"
      glops.each do |glop|
        all_profanity_combinations.each do |profanity, sanitized|
          # dunno if we should force words to just start with this or contain it anywhere...
          # what about 'g..ly' for example?
          # or 'un...ly' ? I think we're ok there...
          next unless glop =~ profanity

          sanitized_glop = sanitize_glop(glop, all_profanity_combinationss)

          # extract timing info
          timing_line = glop.split("\n").first.strip
          timing_line =~ /((\d\d:\d\d:\d\d),(\d\d\d) --> (\d\d:\d\d:\d\d),(\d\d\d))/
          # grab every capture right away, before anything else can run a regexp
          begin_stamp = "#{Regexp.last_match(2)}.#{Regexp.last_match(3)}"
          end_stamp = "#{Regexp.last_match(4)}.#{Regexp.last_match(5)}"

          # "00:03:00.0" , "00:04:00.0", "violence", "of some sort",
          ts_begin = EdlParser.translate_string_to_seconds begin_stamp
          ts_begin -= subtract_from_each_beginning_ts
          ts_begin = multiply_proc.call(ts_begin)
          ts_begin = EdlParser.translate_time_to_human_readable ts_begin, true

          ts_end = EdlParser.translate_string_to_seconds end_stamp
          ts_end += add_to_end_each_ts
          ts_end = multiply_proc.call(ts_end)
          ts_end = EdlParser.translate_time_to_human_readable ts_end, true

          # String#include? is core ruby, no String monkey patch required
          unless output.include? ts_begin
            output << %!"#{ts_begin}" , "#{ts_end}", "profanity", "#{sanitized.gsub(/[\[\]]/, '').strip}", "#{sanitized_glop.strip}",\n!
          end
        end
      end
    end
    output
  end

  # Built-in profanity => replacement list. The words are assembled from
  # fragments so that they do not appear verbatim in this file. A fresh Hash
  # is returned on every call.
  def self.bad_profanities
    arse = 'a' + 's' * 2
    {
      'hell' => ['he..', :full_word],
      'g' + 'o' + 100.chr => 'goodness',
      'g' + 111.chr + 100.chr + 's' => 'deitys',
      'meu deus' => 'lo..',
      'lord' => 'lo..',
      'da' + 'mn' => 'da..',
      'f' + 117.chr + 99.chr + 107.chr => 'f...',
      'allah' => 'allah',
      'bi' + 'tc' + 104.chr => 'b....',
      'bas' + 'ta' + 'r' + 100.chr => 'ba.....',
      arse => ['a..', :full_word],
      arse + 'h' + 'ole' => 'a..h...',
      'dieu' => ['deity', :full_word], # TODO fails...
      arse + 'wipe' => 'a..w....',
      'jes' + 'u' + 's' => 'j....',
      'chri' + 'st' => ['chr...', :full_word], # allow for christian[ity] [good idea or not?]
      'sh' + 'i' + 't' => 'sh..',
      'a realllly bad word' => ['test edited bad word', :full_word]
    }
  end

  # Built-in list of milder words; each one is "replaced" by itself, so it
  # only ends up bracketed in the sanitized text. A fresh Hash is returned on
  # every call.
  def self.semi_bad_profanities
    semi_bad = {}
    ['bloody', 'moron', 'breast', 'idiot', 'sex', 'genital', 'make love', 'making love', 'love mak', 'dumb', 'suck', 'piss'].each { |name|
      # butter?
      semi_bad[name] = name
    }
    semi_bad['crap'] = ['crap', :full_word]
    semi_bad['butt'] = ['butt', :full_word]
    semi_bad
  end

  # Creates the english-ified, single line version of one glop: drops its
  # first (timing) line, strips <i> style tags and punctuation, and replaces
  # every profanity from all the given lists with its bracketed replacement.
  def self.sanitize_glop(glop, all_profanity_combinationss)
    sanitized_glop = glop.lines.to_a[1..-1].join(' ') # take out timing line
    sanitized_glop = sanitized_glop.gsub(/[\r\n]/, '') # flatten 3 lines to 1
    sanitized_glop = sanitized_glop.gsub(/<(.|)(\/|)i>/i, '') # kill <i> and </i>
    sanitized_glop = sanitized_glop.gsub(/[^a-zA-Z0-9'""]/, ' ') # kill weird stuff like ellipses
    sanitized_glop = sanitized_glop.gsub(/\W\W+/, ' ') # remove duplicate " " 's

    # sanitize the subtitles themselves...
    all_profanity_combinationss.each do |all_profanity_combinations|
      all_profanity_combinations.each do |profanity, sanitized|
        sanitized_glop = sanitized_glop.gsub(profanity, sanitized)
      end
    end

    # a replacement can itself be matched again by a later entry (some words
    # are replaced by themselves), so collapse [[[profanity]]] into [profanity]
    sanitized_glop.gsub(/\[+/, '[').gsub(/\]+/, ']')
  end

  private_class_method :bad_profanities, :semi_bad_profanities, :sanitize_glop

end

if $0 == __FILE__
  # a filename followed by complete "profanity sanitized_equivalent" pairs
  # always gives an odd number of arguments
  if ARGV.empty? || ARGV.length.even?
    p 'syntax: filename.srt [prof1 sanitized_equivalent1 prof2 sanitized_equivalent2 ...]'
    exit
  else
    extra_profanities = Hash[*ARGV.drop(1)]
    print SubtitleProfanityFinder.edl_output(ARGV.first, extra_profanities)
  end
end