#!/usr/bin/perl -w
# get-link-desc
#
# USAGE:
#
# call this program with at least one URL on the command line. it
# will go through each URL, trying to find all links ("a href")
# pointers, as well as the text which accompanies it within each
# link.
#
# that is, if it sees
#
# Fnord!
#
# it should print out:
#
# foo.html
# Fnord!
#
# LEGAL:
# written 1999-08-02 by Tkil
# released into the public domain.
use strict;
# usual battery of modules...
use HTML::Parser;
use HTML::Entities;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
# ======================================================================
package LinkDescParser;
# obligatory inheritance
@LinkDescParser::ISA = qw(HTML::Parser);
sub new
{
my $class = shift;
my $self = HTML::Parser->new(@_);
# stuff for LinkDescParser
$self->{ _ldp_ld_stack } = [];
$self->{ _ldp_in_a } = 0;
$self->{ _ldp_href } = '';
$self->{ _ldp_text } = '';
return bless $self, __PACKAGE__;
}
sub start
{
my $self = shift;
my ($tag, $attr, $attrseq, $origtext) = @_;
if ($tag eq 'a')
{
if ($self->{_ldp_in_a})
{
warn "recursive ... tags?!?!? in: " . $origtext;
}
else
{
$self->{_ldp_in_a} = 1;
}
$self->{_ldp_href} = $attr->{href};
}
elsif ($tag eq 'img' && $self->{_ldp_in_a} && exists $attr->{alt})
{
$self->{_ldp_text} .= $attr->{alt};
}
$self->SUPER::start($self, @_);
}
sub text
{
my $self = shift;
my ($text) = @_;
if ($self->{_ldp_in_a})
{
$self->{_ldp_text} .= $text;
}
$self->SUPER::text(@_);
}
sub end
{
my $self = shift;
my ($tag, $origtext) = @_;
if ($tag eq 'a')
{
if ($self->{_ldp_in_a})
{
$self->{_ldp_in_a} = 0;
push @{ $self->{_ldp_ld_stack} },
[ $self->{_ldp_href},
HTML::Entities::decode( $self->{_ldp_text} ) ];
$self->{_ldp_href} = '';
$self->{_ldp_text} = '';
}
else
{
warn "found when not in ... '$origtext'";
return;
}
}
$self->SUPER::end($self, @_);
}
sub get_links_and_desc
{
my $self = shift;
return $self->{_ldp_ld_stack};
}
# ======================================================================
package main;
my $ua = LWP::UserAgent->new;
URL:
foreach my $url (@ARGV)
{
print "=" x 20, " $url ", "=" x 20, "\n";
# try to get the content
my $req = HTTP::Request->new('GET', $url);
my $resp = $ua->request($req);
# did it work?
unless ($resp->is_success())
{
print
"error: ", $resp->code, "\n",
$resp->content, "\n";
next URL;
}
print "*** base: \"", $resp->base, "\"\n";
# try to parse out the links + descriptions
my $ldp = LinkDescParser->new;
$ldp->parse($resp->content);
$ldp->eof;
# then print them all out.
my $links_and_desc = $ldp->get_links_and_desc;
foreach (@$links_and_desc)
{
my ($link, $desc) = @$_;
# trim some whitespace
$desc =~ s/^\s+//;
$desc =~ s/\s+$//;
print $link, "\n\t", $desc, "\n\n";
}
}