path_extractors/generic.rb in arachni-0.3 vs path_extractors/generic.rb in arachni-0.4

- old
+ new

@@ -1,8 +1,8 @@ =begin Arachni - Copyright (c) 2010-2011 Tasos "Zapotek" Laskos <tasos.laskos@gmail.com> + Copyright (c) 2010-2012 Tasos "Zapotek" Laskos <tasos.laskos@gmail.com> This is free software; you can copy and distribute and modify this program under the term of the GPL v2.0 License (See LICENSE file for details) @@ -18,11 +18,11 @@ # full URLs. # # @author: Tasos "Zapotek" Laskos # <tasos.laskos@gmail.com> # <zapotek@segfault.gr> -# @version: 0.1 +# @version: 0.2 # class Generic < Paths # # Returns an array of paths as plain strings @@ -31,13 +31,57 @@ # # @return [Array<String>] paths # def run( doc ) begin - URI.extract( doc.to_s ) + html = doc.to_s + URI.extract( html, ['http', 'https'] ).map { + |u| + + # + # This extractor needs to be a tiny bit intelligent because + # due to its generic nature it'll inevitably match some garbage. + # + # For example, if some JS code contains: + # + # var = 'http://blah.com?id=1' + # + # or + # + # var = { 'http://blah.com?id=1', 1 } + # + # + # The URI.extract call will match: + # + # http://blah.com?id=1' + # + # and + # + # http://blah.com?id=1', + # + # respectively. + # + # + if !includes_quotes?( u ) + u + else + + if html.include?( '\'' + u ) + u.split( '\'' ).first + elsif html.include?( '"' + u ) + u.split( '"' ).first + else + u + end + end + } rescue return [] end + end + + def includes_quotes?( url ) + url.include?( '\'' ) || url.include?( '"' ) end end end