#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

# RSpec examples for the MediaWiki-markup helper methods that the Wp2txt
# module mixes into this example group (see `include Wp2txt` below).
#
# NOTE(review): this file was recovered from a copy whose line breaks had been
# collapsed and whose angle-bracket markup / HTML entities had been stripped or
# decoded. Literals that had to be reconstructed are flagged individually with
# NOTE(review); confirm them against the library before relying on them.

require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
require 'wp2txt'
require 'wp2txt/article'
require 'wp2txt/utils'

describe "Wp2txt" do
  # Placeholder example: it has no body and only acts as a heading in the report.
  it "contains mediawiki-format related functions:" do
  end

  # Makes the helper methods callable without a receiver inside the examples.
  include Wp2txt

  before do
  end

  describe "process_nested_structure" do
    it "parse nested structure replacing str in the format specified" do
      # Every "[[ ... ]]" level is rewritten by the block, so the expected string
      # is the input with "[[" -> "<<" and "]]" -> ">>" at all nesting depths.
      str_before = "[[ab[[cde[[alfa]]]]fg]]"
      str_after = "<<ab<<cde<<alfa>>>>fg>>"
      scanner = StringScanner.new(str_before)
      str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
        "<<" + content + ">>"
      end
      expect(str_processed).to eq str_after

      # Same check with "{{ ... }}" delimiters on a real-world template that
      # contains a nested "{{...}}" argument.
      str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ |passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
      str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ |passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
      scanner = StringScanner.new(str_before)
      str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
        "<<" + content + ">>"
      end
      #str_processed.should == str_after
      expect(str_processed).to eq str_after
    end
  end

  describe "special_chr" do
    it "replaces character references with real characters" do
      # NOTE(review): the input had been entity-decoded in the recovered copy
      # (it read `"  < > & ""`, which does not parse); the named entities below
      # are a reconstruction. Whitespace in the expected string may also have
      # been collapsed -- confirm what special_chr emits for "&nbsp;".
      str_before = "&nbsp; &lt; &gt; &amp; &quot;"
      str_after = " < > & \""
      expect(special_chr(str_before)).to eq str_after
    end
  end

  describe "chrref_to_utf" do
    it "replaces character references with real characters" do
      # NOTE(review): the recovered copy had the already-decoded character on
      # both sides. U+266A is the code point of the expected character; the
      # hexadecimal reference form is an assumption -- confirm.
      str_before = "&#x266A;"
      str_after = "♪"
      expect(chrref_to_utf(str_before)).to eq str_after
    end
  end

  describe "mndash" do
    it "replaces {mdash}, {ndash}, or {–} with '–'" do
      str_before = "{mdash} {ndash} {–}"
      str_after = "– – –"
      expect(mndash(str_before)).to eq str_after
    end
  end

  describe "format_ref" do
    # NOTE(review): "<br />" had been turned into a raw line break in the
    # recovered copy; the exact tag spelling is reconstructed -- confirm.
    it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
      str_before = "[ref]...\r\n...<br />...[/ref]"
      str_after = "... ... ..."
      expect(format_ref(str_before)).to eq str_after
    end
  end

  describe "make_reference" do
    # NOTE(review): the tags in the input were stripped in the recovered copy.
    # The input below is rebuilt from the expected output ("[ref]" / "[/ref]"
    # markers, a "\n" between the two "..." runs, nothing for the trailing
    # tag); the exact tag spellings are assumptions -- confirm.
    it "replaces <ref> tag with [ref]" do
      str_before = "<ref> ... <br /> ... </ref> \n <ref />"
      str_after = "[ref] ... \n ... [/ref] \n "
      expect(make_reference(str_before)).to eq str_after
    end
  end

  describe "remove_table" do
    it "removes table formated parts" do
      # The input nests one "{| ... |}" table inside another.
      str_before = "{| ... \n{| ... \n ...|}\n ...|}"
      str_after = ""
      expect(remove_table(str_before)).to eq str_after
    end
  end

  describe "remove_clade" do
    it "removes clade formated parts" do
      str_before = "\{\{clade ... \n ... \n ... \n\}\}"
      str_after = ""
      expect(remove_clade(str_before)).to eq str_after
    end
  end

  describe "remove_hr" do
    it "removes horizontal lines" do
      str_before = "\n----\n--\n--\n"
      str_after = "\n\n"
      expect(remove_hr(str_before)).to eq str_after
    end
  end

  describe "remove_tag" do
    it "removes tags" do
      # NOTE(review): the angle-bracket tags were stripped in the recovered
      # copy (input and output both read "abc"); reconstructed by analogy with
      # the square-bracket case below -- confirm.
      str_before = "<tag>abc</tag>"
      str_after = "abc"
      expect(remove_tag(str_before)).to eq str_after

      # Second argument supplies custom opening/closing tag delimiters.
      str_before = "[tag]def[/tag]"
      str_after = "def"
      expect(remove_tag(str_before, ['[', ']'])).to eq str_after
    end
  end

  describe "remove_directive" do
    it "removes directive" do
      str_before = "__abc__\n __def__"
      str_after = "\n "
      expect(remove_directive(str_before)).to eq str_after
    end
  end

  describe "remove_emphasis" do
    # Description corrected: it previously duplicated "removes directive".
    it "removes emphasis" do
      str_before = "''abc''\n'''def'''"
      str_after = "abc\ndef"
      expect(remove_emphasis(str_before)).to eq str_after
    end
  end

  describe "escape_nowiki" do
    # NOTE(review): the <nowiki> markup and the placeholder pattern were
    # stripped in the recovered copy (the pattern read just "def"). The
    # "<nowiki-ID>" placeholder shape is an assumption -- confirm against
    # escape_nowiki / unescape_nowiki.
    it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
      str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
      str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
      expect(escape_nowiki(str_before)).to match str_after
    end
  end

  describe "unescape_nowiki" do
    # NOTE(review): placeholders reconstructed from the keys of @nowikis and
    # the expected output; the "<nowiki-ID>" shape is an assumption -- confirm.
    it "replaces <nowiki-object_id> with string stored elsewhere" do
      # Seeds the lookup table directly on the example instance; presumably
      # unescape_nowiki reads @nowikis from the including object.
      @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
      str_before = "<nowiki-123>def<nowiki-124>"
      str_after = "[[abc]]def[[ghi]]"
      expect(unescape_nowiki(str_before)).to eq str_after
    end
  end

  describe "process_interwiki_links" do
    it "formats text link and remove brackets" do
      expect(process_interwiki_links("[[a b]]")).to eq "a b"
      expect(process_interwiki_links("[[a b|c]]")).to eq "c"
      expect(process_interwiki_links("[[a|b|c]]")).to eq "b|c"
      expect(process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]")).to eq "[ɲ], /J/"
    end
  end

  describe "process_external_links" do
    it "formats text link and remove brackets" do
      expect(process_external_links("[http://yohasebe.com yohasebe.com]")).to eq "yohasebe.com"
      expect(process_external_links("[http://yohasebe.com]")).to eq "http://yohasebe.com"
      # Stray "]]" with no external link: the text is expected back unchanged.
      expect(process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}")).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
    end
  end

  describe "process_template" do
    it "removes brackets and leaving some text" do
      str_before = "{{}}"
      str_after = ""
      expect(process_template(str_before)).to eq str_after

      str_before = "{{lang|en|Japan}}"
      str_after = "Japan"
      expect(process_template(str_before)).to eq str_after

      str_before = "{{a|b=c|d=f}}"
      str_after = "a"
      expect(process_template(str_before)).to eq str_after

      str_before = "{{a|b|{{c|d|e}}}}"
      str_after = "e"
      expect(process_template(str_before)).to eq str_after
    end
  end

  # Disabled example, kept for reference.
  # NOTE(review): the expected HTML inside the heredoc lost its markup in the
  # recovered copy; only the text content below survived.
  #
  # describe "expand_template" do
  #   it "gets data corresponding to a given template using mediawiki api" do
  #     uri = "http://en.wiktionary.org/w/api.php"
  #     template = "{{en-verb}}"
  #     word = "kick"
  #     expanded = expand_template(uri, template, word)
  #     html = <<EOD
  # kick (''third-person singular simple present'' '''[[kicks#English|kicks]]''', ''present participle'' '''[[kicking#English|kicking]]''', ''simple past and past participle'' '''[[kicked#English|kicked]]''')[[Category:English verbs|kick]]
  # EOD
  #     html.strip!
  #     expanded.should == html
  #   end
  # end
end