module Bibliothecary module Parsers class Pypi include Bibliothecary::Analyser INSTALL_REGEXP = /install_requires\s*=\s*\[([\s\S]*?)\]/ # Capture Group 1 is package. # Optional Group 2 is [extras]. # Capture Group 3 is Version REQUIRE_REGEXP = /([a-zA-Z0-9]+[a-zA-Z0-9\-_\.]+)(?:\[.*?\])*([><=\w\.,]+)?/ REQUIREMENTS_REGEXP = /^#{REQUIRE_REGEXP}/ MANIFEST_REGEXP = /.*require[^\/]*(\/)?[^\/]*\.(txt|pip|in)$/ # TODO: can this be a more specific regexp so it doesn't match something like ".yarn/cache/create-require-npm-1.0.0.zip"? PIP_COMPILE_REGEXP = /.*require.*$/ # Adapted from https://peps.python.org/pep-0508/#names PEP_508_NAME_REGEXP = /^([A-Z0-9][A-Z0-9._-]*[A-Z0-9]|[A-Z0-9])/i def self.mapping { match_filenames("requirements-dev.txt", "requirements/dev.txt", "requirements-docs.txt", "requirements/docs.txt", "requirements-test.txt", "requirements/test.txt", "requirements-tools.txt", "requirements/tools.txt") => { kind: "manifest", parser: :parse_requirements_txt, }, lambda { |p| PIP_COMPILE_REGEXP.match(p) } => { content_matcher: :pip_compile?, kind: "lockfile", parser: :parse_requirements_txt, }, lambda { |p| MANIFEST_REGEXP.match(p) } => { kind: "manifest", parser: :parse_requirements_txt, can_have_lockfile: false, }, match_filename("requirements.frozen") => { # pattern exists to store frozen deps in requirements.frozen parser: :parse_requirements_txt, kind: "lockfile", }, match_filename("pip-resolved-dependencies.txt") => { # Inferred from pip kind: "lockfile", parser: :parse_requirements_txt, }, match_filename("pip-dependency-graph.json") => { # Exported from pipdeptree --json kind: "lockfile", parser: :parse_dependency_tree_json, }, match_filename("setup.py") => { kind: "manifest", parser: :parse_setup_py, can_have_lockfile: false, }, match_filename("Pipfile") => { kind: "manifest", parser: :parse_pipfile, }, match_filename("Pipfile.lock") => { kind: "lockfile", parser: :parse_pipfile_lock, }, match_filename("pyproject.toml") => { kind: "manifest", parser: :parse_pyproject, }, match_filename("poetry.lock") => { kind: "lockfile", parser: :parse_poetry_lock, }, # Pip dependencies can be embedded in conda environment files match_filename("environment.yml") => { parser: :parse_conda, kind: "manifest", }, match_filename("environment.yaml") => { parser: :parse_conda, kind: "manifest", }, match_filename("environment.yml.lock") => { parser: :parse_conda, kind: "lockfile", }, match_filename("environment.yaml.lock") => { parser: :parse_conda, kind: "lockfile", }, } end add_multi_parser(Bibliothecary::MultiParsers::CycloneDX) add_multi_parser(Bibliothecary::MultiParsers::DependenciesCSV) add_multi_parser(Bibliothecary::MultiParsers::Spdx) def self.parse_pipfile(file_contents, options: {}) # rubocop:disable Lint/UnusedMethodArgument manifest = Tomlrb.parse(file_contents) map_dependencies(manifest["packages"], "runtime") + map_dependencies(manifest["dev-packages"], "develop") end def self.parse_pyproject(file_contents, options: {}) # rubocop:disable Lint/UnusedMethodArgument deps = [] file_contents = Tomlrb.parse(file_contents) # Parse poetry [tool.poetry] deps poetry_manifest = file_contents.fetch("tool", {}).fetch("poetry", {}) deps += map_dependencies(poetry_manifest["dependencies"], "runtime") # Poetry 1.0.0-1.2.0 way of defining dev deps deps += map_dependencies(poetry_manifest["dev-dependencies"], "develop") # Poetry's 1.2.0+ of defining dev deps poetry_manifest .fetch("group", {}) .each_pair do |group_name, obj| group_name = "develop" if group_name == "dev" deps += map_dependencies(obj.fetch("dependencies", {}), group_name) end # Parse PEP621 [project] deps pep621_manifest = file_contents.fetch("project", {}) pep621_deps = pep621_manifest.fetch("dependencies", []).map { |d| parse_pep_508_dep_spec(d) } deps += map_dependencies(pep621_deps, "runtime") # We're combining both poetry+PEP621 deps instead of making them mutually exclusive, until we # find a reason not to ingest them both. deps.uniq end # TODO: this was deprecated in 8.6.0. Remove this in any major version bump >= 9.* def self.parse_poetry(file_contents, options: {}) puts "Warning: parse_poetry() is deprecated, use parse_pyproject() instead." parse_pyproject(file_contents, options) end def self.parse_conda(file_contents, options: {}) # rubocop:disable Lint/UnusedMethodArgument contents = YAML.safe_load(file_contents) return [] unless contents dependencies = contents["dependencies"] pip = dependencies.find { |dep| dep.is_a?(Hash) && dep["pip"]} return [] unless pip Pypi.parse_requirements_txt(pip["pip"].join("\n")) end def self.map_dependencies(packages, type) return [] unless packages packages.map do |name, info| { name: name, requirement: map_requirements(info), type: type, } end end def self.map_requirements(info) if info.is_a?(Hash) if info["version"] info["version"] elsif info["git"] info["git"] + "#" + info["ref"] else "*" end else info || "*" end end def self.parse_pipfile_lock(file_contents, options: {}) # rubocop:disable Lint/UnusedMethodArgument manifest = JSON.parse(file_contents) deps = [] manifest.each do |group, dependencies| next if group == "_meta" group = "runtime" if group == "default" dependencies.each do |name, info| deps << { name: name, requirement: map_requirements(info), type: group, } end end deps end def self.parse_poetry_lock(file_contents, options: {}) # rubocop:disable Lint/UnusedMethodArgument manifest = Tomlrb.parse(file_contents) deps = [] manifest["package"].each do |package| # next if group == "_meta" group = case package["category"] when "dev" "develop" else "runtime" end deps << { name: package["name"], requirement: map_requirements(package), type: group, } end deps end def self.parse_setup_py(file_contents, options: {}) # rubocop:disable Lint/UnusedMethodArgument match = file_contents.match(INSTALL_REGEXP) return [] unless match deps = [] match[1].gsub(/',(\s)?'/, "\n").split("\n").each do |line| next if line.match(/^#/) match = line.match(REQUIRE_REGEXP) next unless match deps << { name: match[1], requirement: match[-1] || "*", type: "runtime", } end deps end # While the thing in the repo that PyPI is using might be either in # egg format or wheel format, PyPI uses "egg" in the fragment of the # VCS URL to specify what package in the PyPI index the VCS URL # should be treated as. NoEggSpecified = Class.new(ArgumentError) def self.parse_dependency_tree_json(file_contents, options: {}) JSON.parse(file_contents) .map do |pkg| { name: pkg.dig("package", "package_name"), requirement: pkg.dig("package", "installed_version"), type: "runtime", } end .uniq end # Parses a requirements.txt file, following the # https://pip.pypa.io/en/stable/cli/pip_install/#requirement-specifiers # and https://pip.pypa.io/en/stable/topics/vcs-support/#git. # Invalid lines in requirements.txt are skipped. def self.parse_requirements_txt(file_contents, options: {}) # rubocop:disable Lint/UnusedMethodArgument deps = [] type = case options[:filename] when /dev/ || /docs/ || /tools/ "development" when /test/ "test" else "runtime" end file_contents.split("\n").each do |line| if line["://"] begin result = parse_requirements_txt_url(line) rescue URI::Error, NoEggSpecified next end deps << result.merge( type: type ) elsif (match = line.delete(" ").match(REQUIREMENTS_REGEXP)) deps << { name: match[1], requirement: match[-1] || "*", type: type, } end end deps.uniq end def self.parse_requirements_txt_url(url) uri = URI.parse(url) raise NoEggSpecified, "No egg specified in #{url}" unless uri.fragment name = uri.fragment[/^egg=([^&]+)([&]|$)/, 1] raise NoEggSpecified, "No egg specified in #{url}" unless name requirement = uri.path[/@(.+)$/, 1] { name: name, requirement: requirement || "*" } end def self.pip_compile?(file_contents) return file_contents.include?("This file is autogenerated by pip-compile") rescue Exception # rubocop:disable Lint/RescueException # We rescue exception here since native libs can throw a non-StandardError # We don't want to throw errors during the matching phase, only during # parsing after we match. false end # Simply parses out the name of a PEP 508 Dependency specification: https://peps.python.org/pep-0508/ # Leaves the rest as-is with any leading semicolons or spaces stripped def self.parse_pep_508_dep_spec(dep) name, requirement = dep.split(PEP_508_NAME_REGEXP, 2).last(2).map(&:strip) requirement = requirement.sub(/^[\s;]*/, "") requirement = "*" if requirement == "" return name, requirement end end end end