class String
  # Words that #titlecase keeps in lowercase unless they start or end a phrase.
  # Each entry is a regular-expression fragment (note "v[.]?" and "vs[.]?");
  # #titlecase joins them with "|" into a single alternation.
  #
  # The list is built once and memoized in a class-level instance variable.
  def self.small_words
    @small_words ||= %w(a an and as at but by en for if in of on or the to v[.]? via vs[.]?)
  end

  # Joins the receiver and +path+ with File.join.
  #
  #   "foo" / "bar"   # => "foo/bar"
  def /(path)
    ::File.join(self, path)
  end

  # Returns a title-cased copy of the string. The receiver is not modified.
  #
  #   "a tale of two cities".titlecase   # => "A Tale of Two Cities"
  #   "the SEC's decision".titlecase     # => "The SEC's Decision"
  #
  # The string is split into phrases at sentence-style punctuation followed by
  # a space and at opening double quotes; each phrase is processed on its own so
  # that a small word starting or ending a phrase is still capitalized.
  def titlecase
    small = self.class.small_words.join('|')

    # Built once here rather than on every iteration of the loop below.
    word_rx        = %r/\b[[:alpha:]][[:lower:].'’]*\b/
    small_rx       = %r/\b(#{small})\b/i
    leading_small  = %r/\A([[:punct:]]*)(#{small})\b/
    trailing_small = %r/\b(#{small})([^\w\s]*)\z/

    # The capture group keeps the separators in the resulting array, so the
    # final join reproduces the original spacing and punctuation.
    parts = split(%r/( [:.;?!][ ] | (?:[ ]|^)["“] )/x)

    parts.each do |part|
      # Capitalize words, but leave dotted tokens (e.g. "example.com") alone.
      part.gsub!(word_rx) { |s| s =~ %r/\w+\.\w+/ ? s : s.capitalize }
      # Lowercase the small words
      part.gsub!(small_rx) { |w| w.downcase }
      # If the first word is a small word, then capitalize it
      part.gsub!(leading_small) { $1 + $2.capitalize }
      # If the last word is a small word, then capitalize it
      part.gsub!(trailing_small) { $1.capitalize + $2 }
    end

    str = parts.join

    # Special cases:
    str.gsub!(/ V(s?)\. /, ' v\1. ')                   # "v." and "vs."
    str.gsub!(/(['’])S\b/, '\1s')                      # 'S (otherwise you get "the SEC'S decision")
    str.gsub!(/\b(AT&T|Q&A)\b/i) { |w| w.upcase }      # "AT&T" and "Q&A", which get tripped up.
    str
  end

  # Borrowed from the excellent StringEx library: git://github.com/rsl/stringex.git
  #
  # Creates a URI-friendly representation of the string: formatting is removed,
  # the result is downcased, runs of whitespace become "-", and leading,
  # trailing and repeated "-" are collapsed.
  #
  #   "Tom &amp; Jerry <b>Big</b> Day!".to_url   # => "tom-and-jerry-big-day"
  def to_url
    remove_formatting.downcase.replace_whitespace("-").collapse("-")
  end

  # Shortcut that runs, in order: strip_html_tags, convert_accented_entities,
  # convert_misc_entities, convert_misc_characters and collapse.
  def remove_formatting
    strip_html_tags.convert_accented_entities.convert_misc_entities.convert_misc_characters.collapse
  end

  # Removes HTML tags from text. This code is simplified from Tobias Luettke's regular expression
  # in Typo[http://typosphere.org].
  #
  # The result is always stripped of leading and trailing whitespace. Unless
  # +leave_whitespace+ is true, internal runs of whitespace are also reduced to
  # a single space.
  #
  #   "<b>Hello</b>   <i>world</i>".strip_html_tags         # => "Hello world"
  #   "<b>Hello</b>   <i>world</i>".strip_html_tags(true)   # => "Hello   world"
  def strip_html_tags(leave_whitespace = false)
    name  = /[\w:_-]+/
    value = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
    attr  = /(#{name}(\s*=\s*#{value})?)/
    rx    = /<[!\/?\[]?(#{name}|--)(\s+(#{attr}(\s+#{attr})*))?\s*([!\/?\]]+|--)?>/

    stripped = gsub(rx, "")
    stripped = stripped.gsub(/\s+/, " ") unless leave_whitespace
    stripped.strip
  end

  # Converts HTML entities into the respective non-accented letters. Examples:
  #
  #   "&aacute;".convert_accented_entities # => "a"
  #   "&ccedil;".convert_accented_entities # => "c"
  #   "&egrave;".convert_accented_entities # => "e"
  #   "&icirc;".convert_accented_entities  # => "i"
  #   "&oslash;".convert_accented_entities # => "o"
  #   "&uuml;".convert_accented_entities   # => "u"
  #
  # Note: This does not do any conversion of Unicode/Ascii accented-characters. For that
  # functionality please use to_ascii.
  def convert_accented_entities
    gsub(/&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);/, '\1')
  end

  # Converts HTML entities (taken from common Textile/RedCloth formattings) into plain text formats.
  # Any remaining entity-shaped token ("&name;", "&#123;", "&#x1F;") is removed.
  #
  #   "wait&#8230;".convert_misc_entities   # => "wait..."
  #   "AT&amp;T".convert_misc_entities      # => "ATandT"
  #   "&copy; 2010".convert_misc_entities   # => "(c) 2010"
  #
  # Note: This isn't an attempt at complete conversion of HTML entities, just those most likely
  # to be generated by Textile.
  def convert_misc_entities
    dummy = dup
    {
      "#822[01]"       => "\"",
      "#821[67]"       => "'",
      "#8230"          => "...",
      "#8211"          => "-",
      "#8212"          => "--",
      "#215"           => "x",
      "gt"             => ">",
      "lt"             => "<",
      "(#8482|trade)"  => "(tm)",
      "(#174|reg)"     => "(r)",
      "(#169|copy)"    => "(c)",
      "(#38|amp)"      => "and",
      "nbsp"           => " ",
      "(#162|cent)"    => " cent",
      "(#163|pound)"   => " pound",
      "(#188|frac14)"  => "one fourth",
      "(#189|frac12)"  => "half",
      "(#190|frac34)"  => "three fourths",
      "(#176|deg)"     => " degrees"
    }.each do |textiled, normal|
      # Each key is a regex fragment for the entity body; wrap it in "&" ... ";".
      dummy.gsub!(/&(?:#{textiled});/, normal)
    end
    # Drop unrecognised entities, but only tokens that really look like one, so
    # ordinary text such as "Fish & chips; yum" is left intact.
    dummy.gsub(/&#?[A-Za-z0-9]+;/, "")
  end

  # Converts various common plaintext characters to a more URI-friendly representation.
  # Most replacements are padded with spaces, so the result may contain leading,
  # trailing or doubled spaces; follow up with collapse to tidy them. Examples:
  #
  #   "foo & bar".convert_misc_characters         # => "foo and bar"
  #   "Chanel #9".convert_misc_characters         # => "Chanel number 9"
  #   "user@host".convert_misc_characters         # => "user at host"
  #   "google.com".convert_misc_characters        # => "google dot com"
  #   "$10".convert_misc_characters               # => " 10 dollars "
  #   "*69".convert_misc_characters               # => " star 69"
  #   "100%".convert_misc_characters              # => "100 percent "
  #   "windows/mac/linux".convert_misc_characters # => "windows slash mac slash linux"
  #
  # Finally, apostrophes inside or at the edge of words are dropped and the
  # remaining punctuation characters are replaced by spaces.
  #
  # Note: Because this method will convert any & symbols to the string "and",
  # you should run any methods which convert HTML entities (convert_accented_entities and
  # convert_misc_entities) before running this method.
  def convert_misc_characters
    dummy = gsub(/\.{3,}/, " dot dot dot ") # Catch ellipses before single dot rule!
    {
      /\s*&\s*/ => "and",
      /\s*#/ => "number",
      /\s*@\s*/ => "at",
      /(\S|^)\.(\S)/ => '\1 dot \2',
      /(\s|^)\$(\d*)(\s|$)/ => '\2 dollars',
      /\s*\*\s*/ => "star",
      /\s*%\s*/ => "percent",
      /\s*(\\|\/)\s*/ => "slash"
    }.each do |found, replaced|
      # Replacements that re-insert the first capture already carry their own
      # spacing; every other replacement is padded with a space on each side.
      replaced = " #{replaced} " unless replaced.include?('\1')
      dummy.gsub!(found, replaced)
    end
    dummy.gsub(/(^|\w)'(\w|$)/, '\1\2').gsub(/[\.,:;()\[\]\/\?!\^'"_]/, " ")
  end

  # Replace runs of whitespace in string. Defaults to a single space but any replacement
  # string may be specified as an argument. Examples:
  #
  #   "Foo   bar".replace_whitespace      # => "Foo bar"
  #   "Foo   bar".replace_whitespace("-") # => "Foo-bar"
  def replace_whitespace(replace = " ")
    gsub(/\s+/, replace)
  end

  # Removes specified character from the beginning and/or end of the string and then performs
  # String#squeeze(character), condensing runs of the character within the string.
  #
  #   "  foo   bar  ".collapse    # => "foo bar"
  #   "--a--b--".collapse("-")    # => "a-b"
  #   "..a..b..".collapse(".")    # => "a.b"
  #
  # Note: This method has been superseded by ActiveSupport's squish method.
  def collapse(character = " ")
    # Escape so that regex metacharacters such as "." or "*" are taken literally.
    literal = Regexp.escape(character)
    sub(/\A#{literal}*/, "").sub(/#{literal}*\z/, "").squeeze(character)
  end
end # class String
# EOF