#!/usr/bin/perl -w # get-link-desc # # USAGE: # # call this program with at least one URL on the command line. it # will go through each URL, trying to find all links ("a href") # pointers, as well as the text which accompanies it within each # link. # # that is, if it sees # # Fnord! # # it should print out: # # foo.html # Fnord! # # LEGAL: # written 1999-08-02 by Tkil # released into the public domain. use strict; # usual battery of modules... use HTML::Parser; use HTML::Entities; use LWP::UserAgent; use HTTP::Request; use HTTP::Response; # ====================================================================== package LinkDescParser; # obligatory inheritance @LinkDescParser::ISA = qw(HTML::Parser); sub new { my $class = shift; my $self = HTML::Parser->new(@_); # stuff for LinkDescParser $self->{ _ldp_ld_stack } = []; $self->{ _ldp_in_a } = 0; $self->{ _ldp_href } = ''; $self->{ _ldp_text } = ''; return bless $self, __PACKAGE__; } sub start { my $self = shift; my ($tag, $attr, $attrseq, $origtext) = @_; if ($tag eq 'a') { if ($self->{_ldp_in_a}) { warn "recursive ... tags?!?!? in: " . $origtext; } else { $self->{_ldp_in_a} = 1; } $self->{_ldp_href} = $attr->{href}; } elsif ($tag eq 'img' && $self->{_ldp_in_a} && exists $attr->{alt}) { $self->{_ldp_text} .= $attr->{alt}; } $self->SUPER::start($self, @_); } sub text { my $self = shift; my ($text) = @_; if ($self->{_ldp_in_a}) { $self->{_ldp_text} .= $text; } $self->SUPER::text(@_); } sub end { my $self = shift; my ($tag, $origtext) = @_; if ($tag eq 'a') { if ($self->{_ldp_in_a}) { $self->{_ldp_in_a} = 0; push @{ $self->{_ldp_ld_stack} }, [ $self->{_ldp_href}, HTML::Entities::decode( $self->{_ldp_text} ) ]; $self->{_ldp_href} = ''; $self->{_ldp_text} = ''; } else { warn "found when not in ... '$origtext'"; return; } } $self->SUPER::end($self, @_); } sub get_links_and_desc { my $self = shift; return $self->{_ldp_ld_stack}; } # ====================================================================== package main; my $ua = LWP::UserAgent->new; URL: foreach my $url (@ARGV) { print "=" x 20, " $url ", "=" x 20, "\n"; # try to get the content my $req = HTTP::Request->new('GET', $url); my $resp = $ua->request($req); # did it work? unless ($resp->is_success()) { print "error: ", $resp->code, "\n", $resp->content, "\n"; next URL; } print "*** base: \"", $resp->base, "\"\n"; # try to parse out the links + descriptions my $ldp = LinkDescParser->new; $ldp->parse($resp->content); $ldp->eof; # then print them all out. my $links_and_desc = $ldp->get_links_and_desc; foreach (@$links_and_desc) { my ($link, $desc) = @$_; # trim some whitespace $desc =~ s/^\s+//; $desc =~ s/\s+$//; print $link, "\n\t", $desc, "\n\n"; } }