lib/virastar.rb in virastar-0.0.3 vs lib/virastar.rb in virastar-0.0.4
- old
+ new
@@ -45,19 +45,10 @@
text.gsub!(/(\S)(ه[\s]+[یي])(\s)/, '\1هٔ\3') if @fix_hamzeh
# remove unnecessary zwnj chars that are succeeded/preceded by a space
text.gsub!(/\s+|\s+/,' ') if @cleanup_zwnj
- # should fix spacing for () [] {} “” «»
- if @fix_spacing_for_braces_and_quotes
- text.gsub!(/\s*(\()\s*([^)]+?)\s*?(\))\s*/,' \1\2\3 ')
- text.gsub!(/\s*(\[)\s*([^)]+?)\s*?(\])\s*/,' \1\2\3 ')
- text.gsub!(/\s*(\{)\s*([^)]+?)\s*?(\})\s*/,' \1\2\3 ')
- text.gsub!(/\s*(“)\s*([^)]+?)\s*?(”)\s*/,' \1\2\3 ')
- text.gsub!(/\s*(«)\s*([^)]+?)\s*?(»)\s*/,' \1\2\3 ')
- end
-
# character replacement
persian_numbers = "۱۲۳۴۵۶۷۸۹۰"
arabic_numbers = "١٢٣٤٥٦٧٨٩٠"
english_numbers = "1234567890"
bad_chars = ",;كي%"
@@ -99,15 +90,28 @@
# ----------------------------------------------------------------
# : ; , . ! ? and their persian equivalents should have one space after and no space before
if @fix_spacing_for_braces_and_quotes
text.gsub!(/[ ]*([:;,؛،.؟!]{1})[ ]*/, '\1 ')
+ # do not put space after colon that separates time parts
+ text.gsub!(/([۰-۹]+):\s+([۰-۹]+)/, '\1:\2')
end
+
+
+ # should fix spacing for () [] {} “” «»
+ if @fix_spacing_for_braces_and_quotes
+ text.gsub!(/\s*(\()\s*([^)]+?)\s*?(\))\s*/,' \1\2\3 ')
+ text.gsub!(/\s*(\[)\s*([^)]+?)\s*?(\])\s*/,' \1\2\3 ')
+ text.gsub!(/\s*(\{)\s*([^)]+?)\s*?(\})\s*/,' \1\2\3 ')
+ text.gsub!(/\s*(“)\s*([^)]+?)\s*?(”)\s*/,' \1\2\3 ')
+ text.gsub!(/\s*(«)\s*([^)]+?)\s*?(»)\s*/,' \1\2\3 ')
+ end
+
# should replace more than one space with just a single one
if @cleanup_spacing
text.gsub!(/[ ]+/,' ')
- #text.gsub!(/\s*[\n]+\s*/," \n")
+ text.gsub!(/([\n]+)[ ]*/,'\1')
end
# remove spaces, tabs, and new lines from the beginning and end of file
text.strip! if @cleanup_begin_and_end
\ No newline at end of file