path_extractors/generic.rb in arachni-0.4.0.4 vs path_extractors/generic.rb in arachni-0.4.1

- old
+ new

@@ -1,87 +1,84 @@ =begin - Arachni - Copyright (c) 2010-2012 Tasos "Zapotek" Laskos <tasos.laskos@gmail.com> + Copyright 2010-2012 Tasos Laskos <tasos.laskos@gmail.com> - This is free software; you can copy and distribute and modify - this program under the term of the GPL v2.0 License - (See LICENSE file for details) + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. =end -module Arachni::Parser::Extractors +require 'uri' # # Extract URLs from arbitrary text. # # You might think that this renders the rest path extractors redundant # but the others can extract paths from HTML attributes, this one can only extract # full URLs. # -# @author: Tasos "Zapotek" Laskos -# <tasos.laskos@gmail.com> -# <zapotek@segfault.gr> -# @version: 0.2 +# @author Tasos "Zapotek" Laskos <tasos.laskos@gmail.com> # -class Generic < Paths +# @version 0.2.1 +# +class Arachni::Parser::Extractors::Generic < Arachni::Parser::Extractors::Base # # Returns an array of paths as plain strings # - # @param [Nokogiri] Nokogiri document + # @param [Nokogiri] doc Nokogiri document # # @return [Array<String>] paths # def run( doc ) - begin - html = doc.to_s - URI.extract( html, ['http', 'https'] ).map { - |u| - - # - # This extractor needs to be a tiny bit intelligent because - # due to its generic nature it'll inevitably match some garbage. - # - # For example, if some JS code contains: - # - # var = 'http://blah.com?id=1' - # - # or - # - # var = { 'http://blah.com?id=1', 1 } - # - # - # The URI.extract call will match: - # - # http://blah.com?id=1' - # - # and - # - # http://blah.com?id=1', - # - # respectively. - # - # - if !includes_quotes?( u ) - u + URI.extract( doc.to_s, %w(http https) ).map do |u| + # + # This extractor needs to be a tiny bit intelligent because + # due to its generic nature it'll inevitably match some garbage. + # + # For example, if some JS code contains: + # + # var = 'http://blah.com?id=1' + # + # or + # + # var = { 'http://blah.com?id=1', 1 } + # + # + # The URI.extract call will match: + # + # http://blah.com?id=1' + # + # and + # + # http://blah.com?id=1', + # + # respectively. + # + if !includes_quotes?( u ) + u + else + if html.include?( "'#{u}" ) + u.split( '\'' ).first + elsif html.include?( "\"#{u}" ) + u.split( '"' ).first else - - if html.include?( '\'' + u ) - u.split( '\'' ).first - elsif html.include?( '"' + u ) - u.split( '"' ).first - else - u - end + u end - } - rescue - return [] + end end + rescue + [] end def includes_quotes?( url ) url.include?( '\'' ) || url.include?( '"' ) end -end end