generic.rb in arachni-0.4

- old
+ new

@@ -1,8 +1,8 @@
 =begin
                   Arachni
-  Copyright (c) 2010-2011 Tasos "Zapotek" Laskos <tasos.laskos@gmail.com>
+  Copyright (c) 2010-2012 Tasos "Zapotek" Laskos <tasos.laskos@gmail.com>
 
   This is free software; you can copy and distribute and modify
   this program under the term of the GPL v2.0 License
   (See LICENSE file for details)
 
@@ -18,11 +18,11 @@
 # full URLs.
 #
 # @author: Tasos "Zapotek" Laskos
 #                                      <tasos.laskos@gmail.com>
 #                                      <zapotek@segfault.gr>
-# @version: 0.1
+# @version: 0.2
 #
 class Generic < Paths
 
     #
     # Returns an array of paths as plain strings
@@ -31,13 +31,57 @@
     #
     # @return   [Array<String>]  paths
     #
     def run( doc )
         begin
-            URI.extract( doc.to_s )
+            html = doc.to_s
+            URI.extract( html, ['http', 'https'] ).map {
+                |u|
+
+                #
+                # Only http/https URLs are extracted. The extractor needs to be a tiny
+                # bit intelligent because, being generic, it'll inevitably match garbage.
+                #
+                # For example, if some JS code contains:
+                #
+                #    var = 'http://blah.com?id=1'
+                #
+                # or
+                #
+                #    var = { 'http://blah.com?id=1', 1 }
+                #
+                # the URI.extract call will match:
+                #
+                #    http://blah.com?id=1'
+                #
+                # and
+                #
+                #    http://blah.com?id=1',
+                #
+                # respectively. So a match that contains a quote is cut at its first
+                # single quote when the document holds it right after a single quote,
+                # else likewise for a double quote; otherwise it is returned untouched.
+                #
+                if !includes_quotes?( u )
+                    u
+                else
+
+                    if html.include?( '\'' + u )
+                        u.split( '\'' ).first
+                    elsif html.include?( '"' + u )
+                        u.split( '"' ).first
+                    else
+                        u
+                    end
+                end
+            }
         rescue
             return [] # any StandardError raised above yields an empty list
         end
+    end
+
+    def includes_quotes?( url ) # true when url contains a single or a double quote
+        url.include?( '\'' ) || url.include?( '"' )
     end
 
 end
 end