#!/usr/bin/perl -w # # written by 2000-01-20 # # this is free software; you may use, modify, and distribute this # program under the same terms as perl itself. use strict; package FQParser; use HTML::Parser; use URI::URL; @FQParser::ISA = qw(HTML::Parser); # takes one optional argument, a base url. typically this will be the # url you fetched the contents from. sub new { my $class = shift; my $self = HTML::Parser->new(); bless $self, $class; # what is the base URL we should try to absolutize to? $self->{_fqp_base} = shift; # an accumulator for the output $self->{_fqp_out} = ''; # and a flag to tell us whether or not we're in the block $self->{_fqp_in_head} = 0; return $self; } # the "start" method is the only one that needs brains. and the "end" # method needs a small brain, since we need to track whether or not # we're in the section. sub start { my ($self, $tag, $attr, $attrseq, $orig_text) = @_; if ($tag eq 'head') { $self->{_fqp_in_head} = 1; } elsif ($self->{_fqp_in_head} && $tag eq 'base') { $self->{_fqp_base} = $attr->{href}; } elsif (exists $attr->{href} || exists $attr->{src}) { # need to munge it. for my $link_attr (qw/href src/) { next unless exists $attr->{$link_attr}; my $orig_url = $attr->{$link_attr}; my $new_url = URI::URL->new($attr->{$link_attr}, $self->{_fqp_base})->abs(); # normally i'd try to do this more gracefully, but this # should do the trick. eeek. if ($orig_url ne $new_url) { $orig_text =~ s{($link_attr)\s*=\s*([\"\']?)$orig_url\2} {$1="$new_url"}i; } } } $self->{_fqp_out} .= $orig_text; } sub end { my ($self, $tag, $orig_text) = @_; if ($tag eq 'head') { $self->{_fqp_in_head} = 0; } $self->{_fqp_out} .= $orig_text; } # the rest of the overrides just copy stuff along sub declaration { my ($self, $decl) = @_; $self->{_fqp_out} .= ""; } sub text { my ($self, $text) = @_; $self->{_fqp_out} .= $text; } sub comment { my ($self, $comment) = @_; $self->{_fqp_out} .= $comment; } # and finally, to get the results... sub get_doc { my $self = shift; return $self->{_fqp_out}; } ############################################################################# ############################################################################# package main; use LWP::Simple; for my $url (@ARGV) { my $fqp = FQParser->new($url); my $contents = get($url); $fqp->parse($contents); print $fqp->get_doc(); } exit 0;