lib/zypper/onlinesearch/data.rb in zypper-onlinesearch-1.0.0 vs lib/zypper/onlinesearch/data.rb in zypper-onlinesearch-1.1.0

- old
+ new

@@ -1,52 +1,56 @@ -require 'nokogiri' +# frozen_string_literal: true +require "nokogiri" + module Zypper module Onlinesearch - + # + # Base class for page scraping. + # class PageData - FORMATS = { - 'aarch64': 'ARM v8.x 64-bit', - 'aarch64_ilp32': 'ARM v8.x 64-bit ilp32 mode', - 'all': 'All', - 'armv6l': 'ARM v6', - 'armv7l': 'ARM v7', - 'extra': 'Extra', - 'i586': 'Intel 32-bit', - 'i686': 'Intel Pentium 32-bit', - 'lang': 'Language', - 'lsrc': 'Language source', - 'noarch': 'No architecture', - 'ppc64le': 'PowerPC 64-bit little-endian', - 'ppc64': 'PowerPC 64-bit', - 'ppc': 'PowerPC', - 'repo': 'Repository', - 'riscv64': 'Risc v64', - 's390x': 'IBM System/390', - 'src': 'Source', - 'x86_64': 'Intel/AMD 64-bit', - 'ymp': '1 Click Install', - } + aarch64: "ARM v8.x 64-bit", + aarch64_ilp32: "ARM v8.x 64-bit ilp32 mode", + all: "All", + armv6l: "ARM v6", + armv7l: "ARM v7", + extra: "Extra", + i586: "Intel 32-bit", + i686: "Intel Pentium 32-bit", + lang: "Language", + lsrc: "Language source", + noarch: "No architecture", + ppc64le: "PowerPC 64-bit little-endian", + ppc64: "PowerPC 64-bit", + ppc: "PowerPC", + repo: "Repository", + riscv64: "Risc v64", + s390x: "IBM System/390", + src: "Source", + x86_64: "Intel/AMD 64-bit", + ymp: "1 Click Install" + }.freeze def initialize(page) @page = Nokogiri::HTML(page) end def expand_link(link) - link = [self.class::URL, link].join('/') unless link =~ /:\/\// - URI(link).to_s.gsub(/([^:])\/\//, '\1/') + link = [self.class::URL, link].join("/") unless link =~ %r{://} + URI(link).to_s.gsub(%r{([^:])//}, '\1/') end end - module Data - module Search + # + # Scraping class for openSUSE search. + # class Opensuse < PageData + URL = "https://software.opensuse.org" - URL = 'https://software.opensuse.org' XPATH_CARDS = '//div[@id="search-result-list"]//div[@class="card-body"]' XPATH_NAME = './/h4[@class="card-title"]' XPATH_DESC = './/p[@class="card-text"]' XPATH_URL = './/h4[@class="card-title"]/a/@href' @@ -57,43 +61,47 @@ cards = @page.xpath(XPATH_CARDS) cards.each do |c| url = expand_link(c.xpath(XPATH_URL).text) name = c.xpath(XPATH_NAME).text - name = (File.basename(url) == name) ? name : (File.basename(url) + ' (' + name + ')') - res << { name: name, description: c.xpath(XPATH_DESC).text.strip.gsub(/\n|\ +/, ' '), url: url } + name = File.basename(url) == name ? name : "#{File.basename(url)} (#{name})" + res << { name: name, description: c.xpath(XPATH_DESC).text.strip.gsub(/\n|\ +/, " "), url: url } end - if res.empty? - if @page.xpath(XPATH_ERROR).text.empty? - name = @page.xpath(Page::Opensuse::XPATH_NAME).text - - unless name.to_s.empty? - res << { name: name, description: @page.xpath(Page::Opensuse::XPATH_SHORTDESC).text.strip } - end + if res.empty? && @page.xpath(XPATH_ERROR).text.empty? + name = @page.xpath(Page::Opensuse::XPATH_NAME).text + unless name.to_s.empty? + res << { name: name, description: @page.xpath(Page::Opensuse::XPATH_SHORTDESC).text.strip } end end res end end + # + # Scraping class for Packman search. + # class Packman < PageData + URL = "http://packman.links2linux.org" - URL = 'http://packman.links2linux.org' - XPATH_PACKAGE = '//table[@id="packagelist"]//tr' XPATH_NAME = './/td[@class="package-name"]/a' XPATH_DESC = './/td[@class="package-descr"]' XPATH_URL = './/td[@class="package-name"]/a/@href' def data res = [] - @page.xpath(XPATH_PACKAGE).each do |pack| name = pack.xpath(XPATH_NAME).text - res << { name: name, description: pack.xpath(XPATH_DESC).text.strip.gsub(/\n|\ +/,' '), url: expand_link(pack.xpath(XPATH_URL).text) } unless name.empty? + next if name.empty? + + res << { + name: name, + description: pack.xpath(XPATH_DESC).text.strip.gsub(/\n|\ +/, " "), + url: expand_link(pack.xpath(XPATH_URL).text) + } end if res.empty? name = @page.xpath(Page::Packman::XPATH_NAME).text @@ -102,42 +110,49 @@ end end res end - end end - module Page - + # + # Scraping class for openSUSE page. + # class Opensuse < PageData + URL = "https://software.opensuse.org" - URL = 'https://software.opensuse.org' - XPATH_NAME = '//h1' - XPATH_SHORTDESC = '//h1/following::p/strong' + XPATH_NAME = "//h1" + XPATH_SHORTDESC = "//h1/following::p/strong" XPATH_DESC = '//*[@id="pkg-desc"]' XPATH_SUPPORTED = '//div[@id="other-distributions-listing"]/h4' - XPATH_SUPPORTED_DISTRO = './h4' + XPATH_SUPPORTED_DISTRO = "./h4" XPATH_SUPPORTED_LABEL = './/following-sibling::div[@class="card mb-2"][1]//a' - XPATH_SUPPORTED_LINK = './/@href' - XPATH_SUPPORTED_VERSION = '../..//div[@class="col-md-2"]' + XPATH_SUPPORTED_LINK = ".//@href" + XPATH_SUPPORTED_VERSION = '../..//div[@class="col-md-2"]' XPATH_COMMUNITY = './/following-sibling::div[contains(@id,"community-packages")][1]//div/div/a' - XPATH_COMMUNITY_LINK = './/@href' - XPATH_COMMUNITY_VERSION = '../..//div[@class="col-md-2"]' + XPATH_COMMUNITY_LINK = ".//@href" + XPATH_COMMUNITY_VERSION = '../..//div[@class="col-md-2"]' XPATH_EXPERIMENTAL = './/following-sibling::div[contains(@id,"experimental-packages")][1]//div/div/a' - XPATH_EXPERIMENTAL_LINK = './/@href' + XPATH_EXPERIMENTAL_LINK = ".//@href" XPATH_EXPERIMENTAL_VERSION = '../..//div[@class="col-md-2"]' + XPATH_UNSUPPORTED = '//div[@id="unsupported-distributions"]/h4' + + XPATH_UNSUPPORTED_DISTRO = "./h4" + XPATH_UNSUPPORTED_LABEL = + './/following-sibling::div[@class="card mb-2" and count(preceding-sibling::h4)=_n_]//a' + XPATH_UNSUPPORTED_LINK = ".//@href" + XPATH_UNSUPPORTED_VERSION = '../..//div[@class="col-md-2"]' + def data res = {} - res[:name] = @page.xpath(XPATH_NAME).text res[:short_description] = @page.xpath(XPATH_SHORTDESC).text.strip res[:description] = @page.xpath(XPATH_DESC).text.chomp res[:versions] = [] @@ -145,21 +160,27 @@ extract(ver, res, :supported, XPATH_SUPPORTED_LABEL, XPATH_SUPPORTED_VERSION, XPATH_SUPPORTED_LINK) extract(ver, res, :community, XPATH_COMMUNITY, XPATH_COMMUNITY_VERSION, XPATH_COMMUNITY_LINK) extract(ver, res, :experimental, XPATH_EXPERIMENTAL, XPATH_EXPERIMENTAL_VERSION, XPATH_EXPERIMENTAL_LINK) end + @page.xpath(XPATH_UNSUPPORTED).each_with_index do |ver, i| + extract(ver, res, :unsupported, + XPATH_UNSUPPORTED_LABEL.gsub(/_n_/, i.next.to_s), + XPATH_UNSUPPORTED_VERSION, XPATH_UNSUPPORTED_LINK) + end + res end - private def extract(ver, res, type, xpath_group, xpath_version, xpath_link) - repo = ''; format = ''; version = nil + repo = "" + format = "" + version = nil ver.xpath(xpath_group).each do |pack| - version = pack.xpath(xpath_version).text.strip if version.empty? version = @old_version else @@ -175,151 +196,143 @@ else @old_repo = repo unless repo =~ /Expert Download/ end end - #puts repo, link, format link = expand_link(pack.xpath(xpath_link).text) if repo =~ /Expert Download/ - res[:versions] << { distro: ver.text, link: link, type: type, repo: @old_repo, format: :extra, version: version} + res[:versions] << { distro: ver.text.gsub(/:/, " "), link: link, type: type, + repo: @old_repo, format: :extra, version: version } next end - next if format.to_s.empty? || (link.include?('/package/show/')) + next if format.to_s.empty? || link.include?("/package/show/") - res[:versions] << { distro: ver.text, link: link, type: type, repo: repo, format: format, version: version } + res[:versions] << { distro: ver.text, link: link, type: type, repo: repo, + format: format, version: version } end - end def format?(str) - PageData::FORMATS.has_value? str + PageData::FORMATS.value? str end - end - + # + # Scraping class for Packman page. + # class Packman < PageData + URL = "http://packman.links2linux.org" - URL = 'http://packman.links2linux.org' - XPATH_NAME = '//td[@id="package-details-header-name"]' XPATH_DESC = '//div[@id="package-description"]' XPATH_PACKAGES = '//td[@id="package-details-left"]//tbody/tr' - XPATH_VERSION = './/td[1]' - XPATH_DISTRO = './/td[2]' - XPATH_FORMAT = './/td[3]' - XPATH_LINK = './/a/@href' + XPATH_VERSION = ".//td[1]" + XPATH_DISTRO = ".//td[2]" + XPATH_FORMAT = ".//td[3]" + XPATH_LINK = ".//a/@href" def data res = {} - res[:name] = @page.xpath(XPATH_NAME).text - res[:short_description] = '' + res[:short_description] = "" res[:description] = @page.xpath(XPATH_DESC).text res[:versions] = [] - @page.xpath(XPATH_PACKAGES).each do |pack| - - version = pack.xpath(XPATH_VERSION).text.split('-')[0].to_s - distro = pack.xpath(XPATH_DISTRO).text.gsub(/_/, ' ') + version = pack.xpath(XPATH_VERSION).text.split("-")[0].to_s + distro = pack.xpath(XPATH_DISTRO).text.gsub(/_/, " ") format = pack.xpath(XPATH_FORMAT).text.strip.to_sym link = pack.xpath(XPATH_LINK).text res[:versions] << { format: format, version: version, distro: distro, type: :supported, link: "http://packman.links2linux.org#{link}", - repo: 'Packman' } + repo: "Packman" } end res end end - end - module Links - + # + # Scraping class for openSUSE links. + # class Opensuse < PageData - XPATH_REPO = '//*[@id="manualopenSUSE"]/h5' - XPATH_REPO_DISTRO = './strong[1]' - XPATH_REPO_LINK = 'following-sibling::pre[1]' + XPATH_REPO_DISTRO = "./strong[1]" + XPATH_REPO_LINK = "following-sibling::pre[1]" XPATH_PACKAGE_GROUP = '//*[@id="directopenSUSE"]/div/div' - XPATH_PACKAGE_DISTRO = './p/strong' - XPATH_PACKAGE_LINK = './/@href' + XPATH_PACKAGE_DISTRO = "./p/strong" + XPATH_PACKAGE_LINK = ".//@href" def data res = { versions: [] } extract(res, -1, XPATH_REPO, XPATH_REPO_DISTRO, XPATH_REPO_LINK) extract(res, -2, XPATH_PACKAGE_GROUP, XPATH_PACKAGE_DISTRO, XPATH_PACKAGE_LINK) res end - private def extract(res, format_idx, xpath_group, xpath_distro, xpath_link) @page.xpath(xpath_group).each do |section| - distro = '' + distro = "" section.xpath(xpath_distro).each do |subsection| distro = subsection.text distro = "openSUSE Leap #{distro}" if distro =~ /^\d\d.\d$/ end - #p distro section.xpath(xpath_link).each do |subsection| link = subsection.text - link = link.gsub("\n", ' ').scan(/(https:\/\/[^ \n]+)/).pop.pop + link = link.gsub("\n", " ").scan(%r{(https://[^ \n]+)}).pop.pop res[:versions] << { distro: distro, - format: File.basename(link).split('.')[format_idx].to_sym, - link: link, + format: File.basename(link).split(".")[format_idx].to_sym, + link: link } - #p link end end - end end + # + # Scraping class for Packman links. + # class Packman < PageData + URL = "http://packman.links2linux.org" XPATH_LINK_DISTRO = '//*[@id="selected-release"]/td[2]' XPATH_LINK_BIN = '//*[@id="package-details-binfiles"]//a/@href' XPATH_LINK_SRC = '//*[@id="package-details-srcfile-heading"]//a/@href' XPATH_LINK_YMP = '//*[@class="ymp"]//a/@href' - URL = 'http://packman.links2linux.org' - def data res = { versions: [] } - - distro = @page.xpath(XPATH_LINK_DISTRO).text.gsub(/\_/, ' ') - + distro = @page.xpath(XPATH_LINK_DISTRO).text.gsub(/_/, " ") @page.xpath(XPATH_LINK_BIN).each do |pack| link = pack.text res[:versions] << { distro: distro, - format: File.basename(link).split('.')[-2].to_sym, + format: File.basename(link).split(".")[-2].to_sym, link: URL + link } end link = res[:versions].last[:link] is_lang = (File.basename(link) =~ /-lang/) && (res[:versions].last[:format] == :noarch) link = @page.xpath(XPATH_LINK_SRC).text res[:versions] << { distro: distro, - format: is_lang ? :lsrc : File.basename(link).split('.')[-2].to_sym, + format: is_lang ? :lsrc : File.basename(link).split(".")[-2].to_sym, link: URL + link } unless is_lang link = @page.xpath(XPATH_LINK_YMP).text @@ -331,12 +344,9 @@ end res end end - end - - end # Data module - + end end end