# encoding: UTF-8 module Stringex module Localization module ConversionExpressions ABBREVIATION = /(\s|\(|^)([[:alpha:]](\.[[:alpha:]])+(\.?)[[:alpha:]]*(\s|\)|$))/ ACCENTED_HTML_ENTITY = /&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);/ APOSTROPHE = /(^|[[:alpha:]])'|`([[:alpha:]]|$)/ CHARACTERS = { and: /\s*&\s*/, at: /\s*@\s*/, degrees: /\s*°\s*/, divide: /\s*÷\s*/, dot: /(\S|^)\.(\S)/, ellipsis: /\s*\.{3,}\s*/, equals: /\s*=\s*/, number: /\s*#/, percent: /\s*%\s*/, plus: /\s*\+\s*/, slash: /\s*(\\|\/|/)\s*/, star: /\s*\*\s*/, } # Things that just get converted to spaces CLEANUP_CHARACTERS = /[\.,:;(){}\[\]\?!\^'ʼ"_\|<>]/ CLEANUP_HTML_ENTITIES = /&[^;]+;/ CURRENCIES_SUPPORTED_SIMPLE = { generic: /¤/, dollars: /\$/, euros: /€/, pounds: /£/, yen: /¥/, reais: /R\$/ } CURRENCIES_SUPPORTED_COMPLEX = { dollars: :dollars_cents, euros: :euros_cents, pounds: :pounds_pence, reais: :reais_cents } CURRENCIES_SUPPORTED = Regexp.new(CURRENCIES_SUPPORTED_SIMPLE.values.join('|')) CURRENCIES_SIMPLE = CURRENCIES_SUPPORTED_SIMPLE.inject({}) do |hash, content| key, expression = content hash[key] = /(?:\s|^)#{expression}(\d*)(?:\s|$)/ hash end CURRENCIES_COMPLEX = CURRENCIES_SUPPORTED_SIMPLE.inject({}) do |hash, content| key, expression = content # Do we really need to not worry about complex currencies if there are none for the currency? complex_key = CURRENCIES_SUPPORTED_COMPLEX[key] if complex_key hash[complex_key] = /(?:\s|^)#{expression}(\d+)\.(\d+)(?:\s|$)/ end hash end CURRENCIES = CURRENCIES_SIMPLE.merge(CURRENCIES_COMPLEX) HTML_ENTITIES = Proc.new(){ base = { amp: %w{#38 amp}, cent: %w{#162 cent}, copy: %w{#169 copy}, deg: %w{#176 deg}, divide: %w{#247 divide}, double_quote: %w{#34 #822[012] quot ldquo rdquo dbquo}, ellipsis: %w{#8230 hellip}, en_dash: %w{#8211 ndash}, em_dash: %w{#8212 mdash}, frac14: %w{#188 frac14}, frac12: %w{#189 frac12}, frac34: %w{#190 frac34}, gt: %w{#62 gt}, lt: %w{#60 lt}, nbsp: %w{#160 nbsp}, pound: %w{#163 pound}, reg: %w{#174 reg}, single_quote: %w{#39 #821[678] apos lsquo rsquo sbquo}, times: %w{#215 times}, trade: %w{#8482 trade}, yen: %w{#165 yen}, } base.inject({}) do |hash, content| key, expression = content hash[key] = /&(#{expression.join('|')});/ hash end }.call HTML_TAG = Proc.new(){ name = /[\w:_-]+/ value = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/ attr = /(#{name}(\s*=\s*#{value})?)/ /<[!\/?\[]?(#{name}|--)(\s+(#{attr}(\s+#{attr})*))?\s*([!\/?\]]+|--)?>/ }.call SMART_PUNCTUATION = { /(“|”|\302\223|\302\224|\303\222|\303\223)/ => '"', /(‘|’|\302\221|\302\222|\303\225)/ => "'", /…/ => "...", } UNREADABLE_CONTROL_CHARACTERS = /[[:cntrl:]]/ # Ordered by denominator then numerator of the value VULGAR_FRACTIONS = { half: /(½|½|½)/, one_third: /(⅓|⅓)/, two_thirds: /(⅔|⅔)/, one_fourth: /(¼|¼|¼)/, three_fourths: /(¾|¾|¾)/, one_fifth: /(⅕|⅕)/, two_fifths: /(⅖|⅖)/, three_fifths: /(⅗|⅗)/, four_fifths: /(⅘|⅘)/, one_sixth: /(⅙|⅙)/, five_sixths: /(⅚|⅚)/, one_eighth: /(⅛|⅛)/, three_eighths: /(⅜|⅜)/, five_eighths: /(⅝|⅝)/, seven_eighths: /(⅞|⅞)/, } WHITESPACE = /\s+/ class << self %w{ abbreviation accented_html_entity apostrophe characters cleanup_characters cleanup_html_entities currencies currencies_simple currencies_complex html_entities html_tag smart_punctuation unreadable_control_characters vulgar_fractions whitespace }.each do |conversion_type| define_method conversion_type do const_get conversion_type.upcase end end end end end end