#!/usr/bin/perl -Tw
# Extract forms from an HTML file and build a new page for them.
# See POD at end for more explanation.
#
# Eli the Bearded 26 April 2001
use strict;
use CGI; # oh so handy for CGI parsing
use URI; # oh so handy for normalizing URLs
require LWP::UserAgent; # oh so handy for fetching pages
package RewriteForm; # our package, a subclass of HTML::Parser
use base "HTML::Parser";# oh so handy for HTML parsing
use vars qw( $textarea $isoption $selname %radio $origpage
%entity $parser $ua $request $response $query );
%entity = (
'&' => '&',
'<' => '<',
'>' => '>',
);
$textarea = 0;
$selname = &defselname();
sub defselname () {
# What to show for a select without a name.
return 'undef';
} # end &defselname
sub defradioname () {
# What to show for a radio input without a name.
return 'undef';
} # end &defradioname
sub defsulmultcount () {
# How many inputs to show for a select multiple
return 5;
} # end &defsulmultcount
sub headers() {
# HTTP headers
return "Content-Type: text/html\n\n";
} # end &headers
# Callback for a tag start.
sub start {
# oo, text, hashref, arrayref, text
my ($self, $tag, $attr, $attrseq, $origtext) = @_;
my $esctext;
my $newtext;
my @allowed;
my $selmult = 0;
if ($tag =~ /^(form|input|select|option|textarea)$/i) {
my $tagtype = lc($1);
my $pre = '';
my $qstr = '';
my $post = "\n
\n";
my $skipthis = 0; # used for radio buttons after the first
$esctext = "e($origtext);
if ($tagtype ne 'option' and $isoption) {
$isoption = 0;
$pre .= "Was still in \$isoption state. Bad HTML? ";
}
if ($tagtype eq 'input') {
my $inptype;
@allowed = ( 'type', 'name', 'value', 'accept', 'checked' );
# We lc() the type since attribute values have the case preserved
# by HTML::Parser. (Attribute names have been lower-cased for us.)
if (defined($$attr{type}) and $$attr{type} = lc($$attr{type}) and
($$attr{type} eq 'hidden' or $$attr{type} eq 'password' or
$$attr{type} eq 'checkbox' )) {
$inptype = $$attr{type};
$$attr{type} = 'text';
} else {
$inptype = $$attr{type};
}
if (defined($inptype)) {
$qstr = "e($inptype);
$pre .= "Input type $qstr
, ";
} else {
$pre .= "Input type unrecognized, ";
}
if (defined($$attr{name})) {
$qstr = "e($$attr{name});
$pre .= "named $qstr
, ";
if (defined($inptype) and $inptype eq 'radio') {
if (defined($radio{$qstr})) {
$post = "Input values for the radio button in the $qstr text " .
"input. " . $post;
$skipthis = 1;
} else {
$radio{$qstr} = 1;
$$attr{type} = 'text';
$post = "Use this text input for $qstr the radio buttons. " .
$post;
}
}
} else {
$pre .= "no name found, ";
}
if (!defined($inptype) or ($inptype !~ /hidden|password|text/)) {
if (defined($$attr{value})) {
$qstr = "e($$attr{value});
$pre .= "with value $qstr
, ";
} else {
$pre .= "with no value, ";
}
}
if (defined($$attr{src})) {
my $newimage = URI->new_abs( $$attr{src}, $origpage );
$$attr{src} = $newimage;
push(@allowed, qw( src border height width ));
}
$pre .= "original HTML
$esctext\n"; } elsif ($tagtype eq 'select') { $selname = &defselname(); # @allowed = ( 'name', 'multiple' ); if (defined($selname = $$attr{name})) { $selname = quote($selname); $pre .= "Turning select $selname into text input. Any option "; $pre .= "values for this select will be printed. "; @allowed = ( 'type', 'name' ); $tagtype='input'; $$attr{name} = 'text'; if(defined($$attr{multiple})) { $pre .= "Note that this select allows multiple inputs, so "; $pre .= "more than one text input follows. "; $selmult = 1; } } } elsif ($tagtype eq 'option') { @allowed = ( 'value', 'selected' ); $isoption = 1; $pre = "Original option HTML
$esctext\n
";
} elsif ($tagtype eq 'form') {
@allowed = ( 'action', 'method', 'enctype', 'name' );
if (defined($$attr{action})) {
my $newaction = URI->new_abs( $$attr{action}, $origpage );
$$attr{action} = $newaction;
$qstr = "e($newaction);
$pre .= "Form has action $qstr
, ";
} else {
$pre .= "No action found for form, ";
}
if (defined($$attr{method})) {
$qstr = "e($$attr{method});
$pre .= "method $qstr
, ";
} else {
$pre .= "default method, ";
}
$pre .= "original HTML $esctext\n"; } elsif ($tagtype eq 'textarea') { @allowed = ( 'name', 'cols', 'rows', 'wrap' ); $textarea = 1; $pre .= "Original HTML
$esctext\n"; } $newtext = '<' . $tagtype; for $_ (@allowed) { if (exists($$attr{$_})) { $newtext .= ' ' . $_; if (defined($$attr{$_})) { $newtext .= '="' . $$attr{$_} . '"'; } } } $newtext .= '>'; print $pre; if ($selmult) { my $i; print "
$text
$esctext\n"; print $origtext unless $wasoption; print $post unless $wasoption; } } # end &end # Quote HTML for safe printing. sub quote ($) { my $string = shift; $string =~ s/([<>&])/$entity{$1}/g; return $string; } $origpage = $ARGV[0]; if(defined($origpage)) { if ($origpage =~ /^-+h/i) { print "Read POD in $0 for help.\n'perldoc $0' should work.\n"; exit; } if ($origpage =~ m<^(?i:http)(://[!-~]{1,2000})$>) { $origpage = "http$1"; } else { print "Unrecognized usage. Use as a CGI or read the POD for help.\n"; print "'perldoc $0' should work.\n"; exit; } } else { $query = new CGI; $origpage = $query->param('url'); print &headers(); } if (defined($origpage)) { if($origpage =~ m<^(?i:http)(://[!-~]{1,2000})$>) { # untainted $origpage = "http$1"; $ua = LWP::UserAgent->new; $request = HTTP::Request->new('GET', $origpage); $response = $ua->request($request); print "
page is " . "e($origpage) . "
\n";
print "page size is " . length($response->content) . " bytes
url
parameter.\n";
}
} else {
my $form = $ENV{SCRIPT_NAME};
print "POD documentation
\n\n"; } print; } if ($. > 1) { print "\n\n\n"; } else { print "\n
Didn't find POD to print\n";
}
}
__DATA__
=pod
=head1 NAME
extract-form : HTML form rewriter for command line or CGI use
=head1 DESCRIPTION
This script will fetch an HTML page via HTTP and extract all the
forms out of it. The forms will be rewritten to expose all hidden
inputs, etc, so that random values can be substituted in. Also
Javascript in the page to verify inputs, etc, will be stripped.
Useful for seeing how CGI programs deal with non-sanctioned input.
During the course of rewriting the forms the script will convert
E