lib/virastar.rb in virastar-0.0.4 vs lib/virastar.rb in virastar-0.0.5
- old
+ new
@@ -27,10 +27,20 @@
@cleanup_begin_and_end = options[:cleanup_begin_and_end] || true
end
# Applies the enabled clean-up rules to @text and returns the resulting string.
# Only the rules visible in this diff are described here; the part of the
# method between the two hunks is not shown.
def cleanup
# `text` refers to the same String object as @text, so the in-place
# gsub!/strip! calls below modify @text as well.
text = @text
+
+ # Replace every http/https URL with a placeholder and bring it back at the end of the process.
# The original URL is stored in `urls`; the placeholder is "__urls__N__",
# where N is 1-based because `i` is incremented before the placeholder is built.
+ urls = []
+ i = 0
+ text.gsub!(/https?:\/\/([-\w\.]+)+(:\d+)?(\/([\w\/_\.]*(\?\S+)?)?)?/) do |s|
+ urls[i] = s.dup
+ i += 1
+ "__urls__#{i}__"
+ end
+
# Replace a double dash with an en dash and a triple dash with an em dash.
# Triple dashes are handled first so they are not consumed by the double-dash rule.
if @fix_dashes
text.gsub!(/-{3}/,'—')
text.gsub!(/-{2}/,'–')
end
@@ -87,35 +97,47 @@
text.gsub!(/ـ+/,"") if @cleanup_kashidas
end
# ----------------------------------------------------------------
+ # Fix outside and inside spacing for () [] {} “” «»:
# spaces around the delimited span are replaced by exactly one space on each
# side, and whitespace directly inside the delimiters is dropped.
# NOTE(review): all five patterns use [^)] for the inner text, including the
# non-parenthesis ones — this looks like a copy/paste leftover; confirm whether
# each pattern should exclude its own closing delimiter instead.
+ if @fix_spacing_for_braces_and_quotes
+ text.gsub!(/[ ]*(\()\s*([^)]+?)\s*?(\))[ ]*/,' \1\2\3 ')
+ text.gsub!(/[ ]*(\[)\s*([^)]+?)\s*?(\])[ ]*/,' \1\2\3 ')
+ text.gsub!(/[ ]*(\{)\s*([^)]+?)\s*?(\})[ ]*/,' \1\2\3 ')
+ text.gsub!(/[ ]*(“)\s*([^)]+?)\s*?(”)[ ]*/,' \1\2\3 ')
+ text.gsub!(/[ ]*(«)\s*([^)]+?)\s*?(»)[ ]*/,' \1\2\3 ')
+ end
+
# : ; , . ! ? and their Persian equivalents should have one space after and no space before.
# NOTE(review): this rule is guarded by the braces/quotes flag rather than a
# punctuation-specific one — confirm that this is intended.
if @fix_spacing_for_braces_and_quotes
- text.gsub!(/[ ]*([:;,؛،.؟!]{1})[ ]*/, '\1 ')
+ text.gsub!(/[ ]*([:;,؛،.؟!]{1})[ ]*/, '\1 ')
# Do not put a space after a colon that separates time parts
# (Persian digits on both sides of the colon).
text.gsub!(/([۰-۹]+):\s+([۰-۹]+)/, '\1:\2')
end
-
-
- # should fix spacing for () [] {} “” «»
+ # Fix inside spacing only for () [] {} “” «»: whitespace directly inside the
# delimiters is dropped and nothing is added around them.
# NOTE(review): the inner character class is [^)] for every delimiter type here as well.
if @fix_spacing_for_braces_and_quotes
- text.gsub!(/\s*(\()\s*([^)]+?)\s*?(\))\s*/,' \1\2\3 ')
- text.gsub!(/\s*(\[)\s*([^)]+?)\s*?(\])\s*/,' \1\2\3 ')
- text.gsub!(/\s*(\{)\s*([^)]+?)\s*?(\})\s*/,' \1\2\3 ')
- text.gsub!(/\s*(“)\s*([^)]+?)\s*?(”)\s*/,' \1\2\3 ')
- text.gsub!(/\s*(«)\s*([^)]+?)\s*?(»)\s*/,' \1\2\3 ')
+ text.gsub!(/(\()\s*([^)]+?)\s*?(\))/,'\1\2\3')
+ text.gsub!(/(\[)\s*([^)]+?)\s*?(\])/,'\1\2\3')
+ text.gsub!(/(\{)\s*([^)]+?)\s*?(\})/,'\1\2\3')
+ text.gsub!(/(“)\s*([^)]+?)\s*?(”)/,'\1\2\3')
+ text.gsub!(/(«)\s*([^)]+?)\s*?(»)/,'\1\2\3')
end
# Replace more than one space with just a single one, and drop spaces that
# directly follow one or more newlines.
if @cleanup_spacing
text.gsub!(/[ ]+/,' ')
- text.gsub!(/([\n]+)[ ]*/,'\1')
+ text.gsub!(/([\n]+)[ ]*/,'\1')
end
# Remove spaces, tabs, and new lines from the beginning and end of the text.
text.strip! if @cleanup_begin_and_end
+
+ # Bring the URLs back: the number N at the end of each placeholder maps to urls[N - 1].
+ text.gsub!(/__urls__\d+__/) do |s|
+ urls[s.split("__").last.to_i - 1]
+ end
text
end
end
\ No newline at end of file