Sha256: aa673f4c37e5159c99292d14a17633b46b911cc220a1c616d17b6183cb6c1c43
Contents?: true
Size: 1.88 KB
Versions: 5
Compression:
Stored size: 1.88 KB
Contents
# frozen_string_literal: true

require_relative "base"
require "open-uri"
require "pdf-reader"
require "json"
require "nokogiri"

module JpLocalGov
  module Data
    # Builds local-government records from the first PDF linked on SEARCH_URL.
    #
    # Each usable PDF text row is expected to split into exactly
    # VALID_COLUMN_COUNT whitespace-separated columns, in this order:
    # code, prefecture, city, prefecture kana, city kana.
    #
    # NOTE(review): every method here is private; presumably Base invokes
    # #retrieve as a hook - confirm against base.rb.
    class Importer < Base
      # Kept for backward compatibility; URL resolution now goes through
      # URI.join against SEARCH_URL.
      URL_DOMAIN = "https://www.soumu.go.jp"
      SEARCH_URL = "https://www.soumu.go.jp/denshijiti/code.html"
      VALID_COLUMN_COUNT = 5

      private

      # Downloads the PDF and converts every data row into a record hash.
      #
      # @return [Array<Hash>] unique records, in the order they appear in the PDF
      def retrieve
        # Block form so the downloaded file is closed once parsing is finished.
        OpenURI.open_uri(pdf_url) do |pdf|
          rows = PDF::Reader.new(pdf).pages.flat_map { |page| page.text.split("\n") }

          rows.map { |row| row.split("\s") } # "\s" is a single space: split on runs of whitespace
              .reject { |items| header?(items) || items.length != VALID_COLUMN_COUNT }
              .map { |items| to_hash(items) }
              .uniq # drops duplicated rows (original note: excludes special-area headers, ex: Hokkaido Sapporo)
        end
      end

      # Maps one split PDF row onto a record hash.
      #
      # @param items [Array<String>] the five columns of a data row
      # @return [Hash] record with :code, :prefecture_code, :prefecture,
      #   :prefecture_kana, :city, :city_kana and :prefecture_capital (1 or 0)
      def to_hash(items)
        code, prefecture, city, prefecture_kana, city_kana = items

        {
          code: code,
          prefecture_code: code[0..1], # first two characters of the code
          prefecture: prefecture,
          prefecture_kana: covert_half_char_to_full_char(prefecture_kana),
          city: city,
          city_kana: covert_half_char_to_full_char(city_kana),
          prefecture_capital: prefecture_capital?(prefecture, city) ? 1 : 0
        }
      end

      # Finds the first link ending in ".pdf" on SEARCH_URL and returns it as an
      # absolute URL.
      #
      # @return [String] absolute URL of the PDF
      # @raise [RuntimeError] when the page contains no link to a PDF
      def pdf_url
        html = OpenURI.open_uri(SEARCH_URL) { |io| Nokogiri::HTML(io) }
        link = html.css('[href$=".pdf"]').first
        raise "No PDF link found on #{SEARCH_URL}" unless link

        # URI.join resolves root-relative, page-relative and absolute hrefs alike.
        URI.join(SEARCH_URL, link["href"].to_s).to_s
      end

      # A row is treated as a header (or blank) unless its first column contains a digit.
      #
      # @param row [Array<String>] the split columns of one PDF row
      # @return [Boolean]
      def header?(row)
        !/[0-9]/.match?(row[0])
      end

      # Converts runs of half-width characters (U+FF61..U+FF9F) to their
      # full-width forms. Whole runs are normalized at once so that a kana
      # followed by a half-width voiced mark composes into a single character.
      #
      # @param text [String]
      # @return [String]
      def covert_half_char_to_full_char(text)
        text.gsub(/[\uFF61-\uFF9F]+/) { |str| str.unicode_normalize(:nfkc) }
      end

      # @param prefecture [String]
      # @param city [String]
      # @return [Boolean] true when the pair is listed in prefecture_capital.json
      def prefecture_capital?(prefecture, city)
        prefecture_capital_list.any? do |prefecture_capital|
          prefecture_capital["prefecture"] == prefecture && prefecture_capital["city"] == city
        end
      end

      # Parsed contents of prefecture_capital.json, loaded once per instance
      # instead of once per PDF row; File.read also closes the file handle.
      #
      # @return [Array<Hash>] entries carrying "prefecture" and "city" keys
      def prefecture_capital_list
        @prefecture_capital_list ||=
          JSON.parse(File.read(File.expand_path("prefecture_capital.json", __dir__)))
      end
    end
  end
end
Version data entries
5 entries across 5 versions & 1 rubygems
Version | Path |
---|---|
jp_local_gov-1.0.0 | data/importer.rb |
jp_local_gov-0.3.1 | data/importer.rb |
jp_local_gov-0.3.0 | data/importer.rb |
jp_local_gov-0.2.1 | data/importer.rb |
jp_local_gov-0.2.0 | data/importer.rb |