curl/perl/contrib/getlinks.pl.in

262 lines
5.9 KiB
Perl
Raw Normal View History

2001-03-16 13:04:57 +00:00
#!@PERL@
#
# getlinks.pl
#
# This script extracts all links from a HTML page, compares them to a pattern
# entered on the command line and then downloads matching links into the
# target dir (also specified on the command line).
#
# Written to use 'curl' for URL fetching, uses the source file names in the
# target directory.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.1 Oct 7, 1998
#
# HISTORY
#
# 0.1 - Created now!
#
$in="";
argv:
if($ARGV[0] eq "-v" ) {
$verbose = 1;
shift @ARGV;
goto argv;
}
if($ARGV[0] eq "-d" ) {
$display = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-h" ) {
$help = 1;
shift @ARGV;
goto argv;
}
$geturl = $ARGV[0];
$getdir = $ARGV[1];
$getregex = $ARGV[2];
if(($geturl eq "") ||
(($getdir eq "") && !$display) ||
$help) {
print "Usage: $0 [-hv] <full source URL> <target dir> [regex]\n",
" Use a traling slash for directory URLs!\n",
" Use \"quotes\" around the regex!\n",
" -h This help text\n",
" -d Display matches only instead of downloading\n",
" -v Verbose mode\n";
exit;
}
# change to target directory:
chdir $getdir ||
die "couldn't cd into $getdir";
# This is necessary from where I tried this:
#$proxy =" -x 194.237.142.41:80";
# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
$linkcheck = "curl -s -m 20 -I$proxy";
# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i$proxy";
# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s$proxy";
# urlget, URL will be appended to the right of this command line
# this stores the file with the remote file name in the current dir
$urlget = "curl -O -s$proxy";
# Parse the input URL and split it into the relevant parts:
sub SplitURL {
my $inurl = $_[0];
if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
2010-02-16 13:32:45 +00:00
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = $4;
2001-03-16 13:04:57 +00:00
}
elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
2010-02-16 13:32:45 +00:00
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = "";
2011-12-30 03:36:18 +01:00
2010-02-16 13:32:45 +00:00
if($getpath !~ /\//) {
$getpath ="";
$getdocument = $3;
}
2010-02-14 19:40:18 +00:00
2001-03-16 13:04:57 +00:00
}
elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
2010-02-16 13:32:45 +00:00
$getprotocol = $1;
$getserver = $2;
$getpath = "";
$getdocument = "";
2001-03-16 13:04:57 +00:00
}
else {
2010-02-16 13:32:45 +00:00
print "Couldn't parse the specified URL, retry please!\n";
exit;
2001-03-16 13:04:57 +00:00
}
}
&SplitURL($geturl);
#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;
if(!$usestdin) {
open(HEADGET, "$linkcheck $geturl|") ||
2010-02-16 13:32:45 +00:00
die "Couldn't get web page for some reason";
2001-03-16 13:04:57 +00:00
headget:
while(<HEADGET>) {
2010-02-16 13:32:45 +00:00
# print $_;
if($_ =~ /HTTP\/.*3\d\d /) {
$pagemoved=1;
}
elsif($pagemoved &&
($_ =~ /^Location: (.*)/)) {
$geturl = $1;
&SplitURL($geturl);
$pagemoved++;
last headget;
}
2001-03-16 13:04:57 +00:00
}
close(HEADGET);
if($pagemoved == 1) {
2010-02-16 13:32:45 +00:00
print "Page is moved but we don't know where. Did you forget the ",
"traling slash?\n";
exit;
2001-03-16 13:04:57 +00:00
}
open(WEBGET, "$htmlget $geturl|") ||
2010-02-16 13:32:45 +00:00
die "Couldn't get web page for some reason";
2001-03-16 13:04:57 +00:00
while(<WEBGET>) {
2010-02-16 13:32:45 +00:00
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
# print $line."\n";
$in=$in.$line;
2001-03-16 13:04:57 +00:00
}
close(WEBGET);
}
else {
while(<STDIN>) {
2010-02-16 13:32:45 +00:00
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
$in=$in.$line;
2001-03-16 13:04:57 +00:00
}
}
sub GetLinks {
my $in = $_[0];
my @result;
getlinkloop:
while($in =~ /[^<]*(<[^>]+>)/g ) {
2010-02-16 13:32:45 +00:00
# we have a tag in $1
$tag = $1;
2011-12-30 03:36:18 +01:00
2010-02-16 13:32:45 +00:00
if($tag =~ /^<!--/) {
# this is a comment tag, ignore it
}
else {
if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ )>]*)/i) {
$url=$2;
if($url =~ /^\"(.*)\"$/) {
# this was a "string" now $1 has removed the quotes:
$url=$1;
}
$url =~ s/([^\#]*)\#.*/$1/g;
if($url eq "") {
# if the link was nothing than a #-link it may now have
# been emptied completely so then we skip the rest
next getlinkloop;
}
if($done{$url}) {
# if this url already is done, do next
$done{$url}++;
next getlinkloop;
}
$done{$url} = 1; # this is "done"
push @result, $url;
if($tag =~ /< *([^ ]+)/) {
# print "TAG: $1\n";
$tagtype{$url}=$1;
}
}
2001-03-16 13:04:57 +00:00
}
}
return @result;
}
@links = &GetLinks($in);
linkloop:
for(@links) {
$url = $_;
if($url =~ /^([^:]+):/) {
2010-02-16 13:32:45 +00:00
$link = $url;
2001-03-16 13:04:57 +00:00
}
else {
2010-02-16 13:32:45 +00:00
# this is an absolute link on the same server:
if($url =~ /^\//) {
# from root
$link = "$getprotocol://$getserver$url";
}
else {
# from the scanned page's dir
$nyurl=$url;
if(length($getpath) &&
($getpath !~ /\/$/) &&
($nyurl !~ /^\//)) {
# lacks ending slash, add one to the document part:
$nyurl = "/".$nyurl;
}
$link = "$getprotocol://$getserver/$getpath$nyurl";
}
2001-03-16 13:04:57 +00:00
}
if($link =~ /$getregex/) {
2010-02-16 13:32:45 +00:00
if($display) {
print "$link\n";
}
else {
if($verbose) {
print "Gets $link\n";
}
print `$urlget $link`;
}
2001-03-16 13:04:57 +00:00
}
}