perl: remove subdir, not touched in 9 years

This commit is contained in:
Daniel Stenberg 2015-05-24 00:54:55 +02:00
parent 3a973517a9
commit 28cdc0d82c
8 changed files with 0 additions and 1431 deletions

View File

@ -1,3 +0,0 @@
SUBDIRS = Curl_easy
EXTRA_DIST = README

View File

@ -1,24 +0,0 @@
_ _ ____ _
___| | | | _ \| |
/ __| | | | |_) | |
| (__| |_| | _ <| |___
\___|\___/|_| \_\_____|
Perl
Perl's a great script language, not the least for quick prototyping. Curl is
elegantly used from within it. You can either invoke external curl command
line or use the curl interface.
The latest release of Curl_easy, a Perl interface to curl is available from
http://curl.haxx.se/libcurl/perl/
(Georg Horn's original version of Curl_easy, supporting curl versions
before 7.7 is still available from: http://www.koblenz-net.de/~horn/export/ )
Using the Curl::easy module is just straightforward and
works much like using libcurl in a C programm, so please refer to the
documentation of libcurl. Have a look at test.pl to get an idea of how
to start.

View File

@ -1,336 +0,0 @@
#!@PERL@
#
# checklinks.pl
#
# This script extracts all links from a HTML page and checks their validity.
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.7 Sept 30, 1998
#
# HISTORY
#
# 0.5 - Cuts off the #-part from links before checking.
#
# 0.6 - Now deals with error codes 3XX better and follows the Location:
# properly.
# - Added the -x flag that only checks http:// -links
#
# 0.7 - Ok, http://www.viunga.se/main.html didn't realize this had no path
# but a document. Now it does.
#
#
$in="";
argv:
if($ARGV[0] eq "-v" ) {
$verbose = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-i" ) {
$usestdin = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-l" ) {
$linenumber = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-h" ) {
$help = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-x" ) {
$external = 1;
shift @ARGV;
goto argv;
}
$geturl = $ARGV[0];
if(($geturl eq "") || $help) {
print "Usage: $0 [-hilvx] <full URL>\n",
" Use a traling slash for directory URLs!\n",
" -h This help text\n",
" -i Read the initial page from stdin\n",
" -l Line number report for BAD links\n",
" -v Verbose mode\n",
" -x Check non-local (external?) links only\n";
exit;
}
if($ARGV[1] eq "-") {
print "We use stdin!\n";
$usestdin = 1;
}
# This is necessary from where I tried this:
#$proxy =" -x 194.237.142.41:80";
# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
$linkcheck = "curl -s -m 20 -I$proxy";
# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i$proxy";
# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s$proxy";
# Parse the input URL and split it into the relevant parts:
sub SplitURL {
my $inurl = $_[0];
if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = $4;
}
elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = "";
if($getpath !~ /\//) {
$getpath ="";
$getdocument = $3;
}
}
elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = "";
$getdocument = "";
}
else {
print "Couldn't parse the specified URL, retry please!\n";
exit;
}
}
&SplitURL($geturl);
#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;
if(!$usestdin) {
open(HEADGET, "$linkcheck $geturl|") ||
die "Couldn't get web page for some reason";
headget:
while(<HEADGET>) {
# print $_;
if($_ =~ /HTTP\/.*3\d\d /) {
$pagemoved=1;
}
elsif($pagemoved &&
($_ =~ /^Location: (.*)/)) {
$geturl = $1;
&SplitURL($geturl);
$pagemoved++;
last headget;
}
}
close(HEADGET);
if($pagemoved == 1) {
print "Page is moved but we don't know where. Did you forget the ",
"traling slash?\n";
exit;
}
open(WEBGET, "$htmlget $geturl|") ||
die "Couldn't get web page for some reason";
while(<WEBGET>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
# print $line."\n";
$in=$in.$line;
}
close(WEBGET);
}
else {
while(<STDIN>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
$in=$in.$line;
}
}
#print length($in)."\n";
sub LinkWorks {
my $check = $_[0];
# URL encode:
# $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;
@doc = `$linkcheck \"$check\"`;
$head = 1;
# print "COMMAND: $linkcheck \"$check\"\n";
# print $doc[0]."\n";
boo:
if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
$error = $1;
if($error < 400 ) {
return "GOOD";
}
else {
if($head && ($error >= 500)) {
# This server doesn't like HEAD!
@doc = `$linkcheckfull \"$check\"`;
$head = 0;
goto boo;
}
return "BAD";
}
}
return "BAD";
}
sub GetLinks {
my $in = $_[0];
my @result;
getlinkloop:
while($in =~ /[^<]*(<[^>]+>)/g ) {
# we have a tag in $1
$tag = $1;
if($tag =~ /^<!--/) {
# this is a comment tag, ignore it
}
else {
if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ )>]*)/i) {
$url=$2;
if($url =~ /^\"(.*)\"$/) {
# this was a "string" now $1 has removed the quotes:
$url=$1;
}
$url =~ s/([^\#]*)\#.*/$1/g;
if($url eq "") {
# if the link was nothing than a #-link it may now have
# been emptied completely so then we skip the rest
next getlinkloop;
}
if($done{$url}) {
# if this url already is done, do next
$done{$url}++;
next getlinkloop;
}
$done{$url} = 1; # this is "done"
push @result, $url;
if($tag =~ /< *([^ ]+)/) {
# print "TAG: $1\n";
$tagtype{$url}=$1;
}
}
}
}
return @result;
}
@links = &GetLinks($in);
linkloop:
for(@links) {
$url = $_;
if($url =~ /^([^:]+):/) {
$prot = $1;
# if($prot !~ /(http|ftp)/i) {
if($prot !~ /http/i) {
# this is an unsupported protocol, we ignore this
next linkloop;
}
$link = $url;
}
else {
if($external) {
next linkloop;
}
# this is a link on the save server:
if($url =~ /^\//) {
# from root
$link = "$getprotocol://$getserver$url";
}
else {
# from the scanned page's dir
$nyurl=$url;
if(length($getpath) &&
($getpath !~ /\/$/) &&
($nyurl !~ /^\//)) {
# lacks ending slash, add one to the document part:
$nyurl = "/".$nyurl;
}
$link = "$getprotocol://$getserver/$getpath$nyurl";
}
}
#print "test $link\n";
#$success = "GOOD";
$success = &LinkWorks($link);
$count = $done{$url};
$allcount += $count;
print "$success $count <".$tagtype{$url}."> $link $url\n";
# If bad and -l, present the line numbers of the usage
if("BAD" eq $success) {
$badlinks++;
if($linenumber) {
$line =1;
for(@indoc) {
if($_ =~ /$url/) {
print " line $line\n";
}
$line++;
}
}
}
}
if($verbose) {
print "$allcount links were checked";
if($badlinks > 0) {
print ", $badlinks were found bad";
}
print "\n";
}

View File

@ -1,443 +0,0 @@
#!/usr/bin/perl
#
# crawlink.pl
#
# This script crawls across all found links below the given "root" URL.
# It reports all good and bad links to stdout. This code was based on the
# checklink.pl script I wrote ages ago.
#
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <daniel@haxx.se>
# Version: 0.3 Jan 3, 2001
#
# HISTORY
#
# 0.3 - The -i now adds regexes that if a full URL link matches one of those,
# it is not followed. This can then be used to prevent this script from
# following '.*\.cgi', specific pages or whatever.
#
# 0.2 - Made it only HEAD non html files (i.e skip the GET). Makes it a lot
# faster to skip large non HTML files such as pdfs or big RFCs! ;-)
# Added a -c option that allows me to pass options to curl.
#
# 0.1 - The given url works as the root. This script will only continue
# and check other URLs if the leftmost part of the new URL is identical
# to the root URL.
#
use strict;
my $in="";
my $verbose=0;
my $usestdin;
my $linenumber;
my $help;
my $external;
my $curlopts;
my @ignorelist;
argv:
if($ARGV[0] eq "-v" ) {
$verbose++;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-c" ) {
$curlopts=$ARGV[1];
shift @ARGV;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-i" ) {
push @ignorelist, $ARGV[1];
shift @ARGV;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-l" ) {
$linenumber = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-h" ) {
$help = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-x" ) {
$external = 1;
shift @ARGV;
goto argv;
}
my $geturl = $ARGV[0];
my $firsturl= $geturl;
#
# Define a hash array to hold all root URLs to visit/we have visited
#
my %rooturls;
$rooturls{$ARGV[0]}=1;
if(($geturl eq "") || $help) {
print "Usage: $0 [-hilvx] <full URL>\n",
" Use a traling slash for directory URLs!\n",
" -c [data] Pass [data] as argument to every curl invoke\n",
" -h This help text\n",
" -i [regex] Ignore root links that match this pattern\n",
" -l Line number report for BAD links\n",
" -v Verbose mode\n",
" -x Check non-local (external?) links only\n";
exit;
}
my $proxy;
if($curlopts ne "") {
$proxy=" $curlopts";
#$proxy =" -x 194.237.142.41:80";
}
# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
my $linkcheck = "curl -s -m 20 -I$proxy";
# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
my $linkcheckfull = "curl -s -m 20 -i$proxy";
# htmlget, URL will be appended to the right of this command line
my $htmlget = "curl -s$proxy";
# Parse the input URL and split it into the relevant parts:
my $getprotocol;
my $getserver;
my $getpath;
my $getdocument;
my %done;
my %tagtype;
my $allcount=0;
my $badlinks=0;
sub SplitURL {
my $inurl = $_[0];
if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = $4;
}
elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = "";
if($getpath !~ /\//) {
$getpath ="";
$getdocument = $3;
}
}
elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = "";
$getdocument = "";
}
else {
print "Couldn't parse the specified URL, retry please!\n";
exit;
}
}
my @indoc;
sub GetRootPage {
my $geturl = $_[0];
my $in="";
my $code=200;
my $type="text/plain";
my $pagemoved=0;
open(HEADGET, "$linkcheck $geturl|") ||
die "Couldn't get web page for some reason";
while(<HEADGET>) {
#print STDERR $_;
if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) {
$code=$1;
if($code =~ /^3/) {
$pagemoved=1;
}
}
elsif($_ =~ /^Content-Type: ([\/a-zA-Z]+)/) {
$type=$1;
}
elsif($pagemoved &&
($_ =~ /^Location: (.*)/)) {
$geturl = $1;
&SplitURL($geturl);
$pagemoved++;
last;
}
}
close(HEADGET);
if($pagemoved == 1) {
print "Page is moved but we don't know where. Did you forget the ",
"traling slash?\n";
exit;
}
if($type ne "text/html") {
# there no point in getting anything but HTML
$in="";
}
else {
open(WEBGET, "$htmlget $geturl|") ||
die "Couldn't get web page for some reason";
while(<WEBGET>) {
my $line = $_;
push @indoc, $line;
$line=~ s/\n/ /g;
$line=~ s/\r//g;
$in=$in.$line;
}
close(WEBGET);
}
return ($in, $code, $type);
}
sub LinkWorks {
my $check = $_[0];
# URL encode:
# $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;
my @doc = `$linkcheck \"$check\"`;
my $head = 1;
# print "COMMAND: $linkcheck \"$check\"\n";
# print $doc[0]."\n";
boo:
if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
my $error = $1;
if($error < 400 ) {
return "GOOD";
}
else {
if($head && ($error >= 500)) {
# This server doesn't like HEAD!
@doc = `$linkcheckfull \"$check\"`;
$head = 0;
goto boo;
}
return "BAD";
}
}
return "BAD";
}
sub GetLinks {
my $in = $_[0];
my @result;
while($in =~ /[^<]*(<[^>]+>)/g ) {
# we have a tag in $1
my $tag = $1;
if($tag =~ /^<!--/) {
# this is a comment tag, ignore it
}
else {
if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ \)>]*)/i) {
my $url=$2;
if($url =~ /^\"(.*)\"$/) {
# this was a "string" now $1 has removed the quotes:
$url=$1;
}
$url =~ s/([^\#]*)\#.*/$1/g;
if($url eq "") {
# if the link was nothing than a #-link it may now have
# been emptied completely so then we skip the rest
next;
}
if($done{$url}) {
# if this url already is done, do next
$done{$url}++;
if($verbose) {
print " FOUND $url but that is already checked\n";
}
next;
}
$done{$url} = 1; # this is "done"
push @result, $url;
if($tag =~ /< *([^ ]+)/) {
$tagtype{$url}=$1;
}
}
}
}
return @result;
}
while(1) {
$geturl=-1;
for(keys %rooturls) {
if($rooturls{$_} == 1) {
if($_ !~ /^$firsturl/) {
$rooturls{$_} += 1000; # don't do this, outside our scope
if($verbose) {
print "SKIP: $_\n";
}
next;
}
$geturl=$_;
last;
}
}
if($geturl == -1) {
last;
}
#
# Splits the URL in its different parts
#
&SplitURL($geturl);
#
# Returns the full HTML of the root page
#
my ($in, $error, $ctype) = &GetRootPage($geturl);
$rooturls{$geturl}++; # increase to prove we have already got it
if($ctype ne "text/html") {
# this is not HTML, we skip this
if($verbose == 2) {
print "Non-HTML link, skipping\n";
next;
}
}
if($error >= 400) {
print "ROOT page $geturl returned $error\n";
next;
}
print " ==== $geturl ====\n";
if($verbose == 2) {
printf("Error code $error, Content-Type: $ctype, got %d bytes\n",
length($in));
}
#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;
#
# Extracts all links from the given HTML buffer
#
my @links = &GetLinks($in);
for(@links) {
my $url = $_;
my $link;
if($url =~ /^([^:]+):/) {
my $prot = $1;
if($prot !~ /http/i) {
# this is an unsupported protocol, we ignore this
next;
}
$link = $url;
}
else {
if($external) {
next;
}
# this is a link on the same server:
if($url =~ /^\//) {
# from root
$link = "$getprotocol://$getserver$url";
}
else {
# from the scanned page's dir
my $nyurl=$url;
if(length($getpath) &&
($getpath !~ /\/$/) &&
($nyurl !~ /^\//)) {
# lacks ending slash, add one to the document part:
$nyurl = "/".$nyurl;
}
$link = "$getprotocol://$getserver/$getpath$nyurl";
}
}
my $success = &LinkWorks($link);
my $count = $done{$url};
$allcount += $count;
print "$success $count <".$tagtype{$url}."> $link $url\n";
if("BAD" eq $success) {
$badlinks++;
if($linenumber) {
my $line =1;
for(@indoc) {
if($_ =~ /$url/) {
print " line $line\n";
}
$line++;
}
}
}
else {
# the link works, add it if it isn't in the ingore list
my $ignore=0;
for(@ignorelist) {
if($link =~ /$_/) {
$ignore=1;
}
}
if(!$ignore) {
# not ignored, add
$rooturls{$link}++; # check this if not checked already
}
}
}
}
if($verbose) {
print "$allcount links were checked";
if($badlinks > 0) {
print ", $badlinks were found bad";
}
print "\n";
}

View File

@ -1,193 +0,0 @@
#!/usr/bin/env perl
#
# formfind.pl
#
# This script gets a HTML page on stdin and presents form information on
# stdout.
#
# Author: Daniel Stenberg <daniel@haxx.se>
# Version: 0.2 Nov 18, 2002
#
# HISTORY
#
# 0.1 - Nov 12 1998 - Created now!
# 0.2 - Nov 18 2002 - Enhanced. Removed URL support, use only stdin.
#
$in="";
if($ARGV[0] eq "-h") {
print "Usage: $0 < HTML\n";
exit;
}
sub namevalue {
my ($tag)=@_;
my $name=$tag;
if($name =~ /name *=/i) {
if($name =~ /name *= *([^\"\']([^ \">]*))/i) {
$name = $1;
}
elsif($name =~ /name *= *(\"|\')([^\"\']*)(\"|\')/i) {
$name=$2;
}
else {
# there is a tag but we didn't find the contents
$name="[weird]";
}
}
else {
# no name given
$name="";
}
# get value tag
my $value= $tag;
if($value =~ /[^\.a-zA-Z0-9]value *=/i) {
if($value =~ /[^\.a-zA-Z0-9]value *= *([^\"\']([^ \">]*))/i) {
$value = $1;
}
elsif($value =~ /[^\.a-zA-Z0-9]value *= *(\"|\')([^\"\']*)(\"|\')/i) {
$value=$2;
}
else {
# there is a tag but we didn't find the contents
$value="[weird]";
}
}
else {
$value="";
}
return ($name, $value);
}
while(<STDIN>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
$in=$in.$line;
}
while($in =~ /[^<]*(<[^>]+>)/g ) {
# we have a tag in $1
$tag = $1;
if($tag =~ /^<!--/) {
# this is a comment tag, ignore it
}
else {
if(!$form &&
($tag =~ /^< *form/i )) {
$method= $tag;
if($method =~ /method *=/i) {
$method=~ s/.*method *= *(\"|)([^ \">]*).*/$2/gi;
}
else {
$method="get"; # default method
}
$action= $tag;
$action=~ s/.*action *= *(\'|\"|)([^ \"\'>]*).*/$2/gi;
$method=uc($method);
$enctype=$tag;
if ($enctype =~ /enctype *=/) {
$enctype=~ s/.*enctype *= *(\'|\"|)([^ \"\'>]*).*/$2/gi;
if($enctype eq "multipart/form-data") {
$enctype="multipart form upload [use -F]"
}
$enctype = "\n--- type: $enctype";
}
else {
$enctype="";
}
print "--- FORM report. Uses $method to URL \"$action\"$enctype\n";
$form=1;
}
elsif($form &&
($tag =~ /< *\/form/i )) {
print "--- end of FORM\n";
$form=0;
if( 0 ) {
print "*** Fill in all or any of these: (default assigns may be shown)\n";
for(@vars) {
$var = $_;
$def = $value{$var};
print "$var=$def\n";
}
print "*** Pick one of these:\n";
for(@alts) {
print "$_\n";
}
}
undef @vars;
undef @alts;
}
elsif($form &&
($tag =~ /^< *(input|select)/i)) {
$mtag = $1;
($name, $value)=namevalue($tag);
if($mtag =~ /select/i) {
print "Select: NAME=\"$name\"\n";
push @vars, "$name";
$select = 1;
}
else {
$type=$tag;
if($type =~ /type *=/i) {
$type =~ s/.*type *= *(\'|\"|)([^ \"\'>]*).*/$2/gi;
}
else {
$type="text"; # default type
}
$type=uc($type);
if(lc($type) eq "reset") {
# reset types are for UI only, ignore.
}
elsif($name eq "") {
# let's read the value parameter
print "Button: \"$value\" ($type)\n";
push @alts, "$value";
}
else {
print "Input: NAME=\"$name\"";
if($value ne "") {
print " VALUE=\"$value\"";
}
print " ($type)\n";
push @vars, "$name";
# store default value:
$value{$name}=$value;
}
}
}
elsif($form &&
($tag =~ /^< *textarea/i)) {
my ($name, $value)=namevalue($tag);
print "Textarea: NAME=\"$name\"\n";
}
elsif($select) {
if($tag =~ /^< *\/ *select/i) {
print "[end of select]\n";
$select = 0;
}
elsif($tag =~ /[^\/] *option/i ) {
my ($name, $value)=namevalue($tag);
my $s;
if($tag =~ /selected/i) {
$s= " (SELECTED)";
}
print " Option VALUE=\"$value\"$s\n";
}
}
}
}

View File

@ -1,261 +0,0 @@
#!@PERL@
#
# getlinks.pl
#
# This script extracts all links from a HTML page, compares them to a pattern
# entered on the command line and then downloads matching links into the
# target dir (also specified on the command line).
#
# Written to use 'curl' for URL fetching, uses the source file names in the
# target directory.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.1 Oct 7, 1998
#
# HISTORY
#
# 0.1 - Created now!
#
$in="";
argv:
if($ARGV[0] eq "-v" ) {
$verbose = 1;
shift @ARGV;
goto argv;
}
if($ARGV[0] eq "-d" ) {
$display = 1;
shift @ARGV;
goto argv;
}
elsif($ARGV[0] eq "-h" ) {
$help = 1;
shift @ARGV;
goto argv;
}
$geturl = $ARGV[0];
$getdir = $ARGV[1];
$getregex = $ARGV[2];
if(($geturl eq "") ||
(($getdir eq "") && !$display) ||
$help) {
print "Usage: $0 [-hv] <full source URL> <target dir> [regex]\n",
" Use a traling slash for directory URLs!\n",
" Use \"quotes\" around the regex!\n",
" -h This help text\n",
" -d Display matches only instead of downloading\n",
" -v Verbose mode\n";
exit;
}
# change to target directory:
chdir $getdir ||
die "couldn't cd into $getdir";
# This is necessary from where I tried this:
#$proxy =" -x 194.237.142.41:80";
# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
$linkcheck = "curl -s -m 20 -I$proxy";
# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i$proxy";
# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s$proxy";
# urlget, URL will be appended to the right of this command line
# this stores the file with the remote file name in the current dir
$urlget = "curl -O -s$proxy";
# Parse the input URL and split it into the relevant parts:
sub SplitURL {
my $inurl = $_[0];
if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = $4;
}
elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = $3;
$getdocument = "";
if($getpath !~ /\//) {
$getpath ="";
$getdocument = $3;
}
}
elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
$getprotocol = $1;
$getserver = $2;
$getpath = "";
$getdocument = "";
}
else {
print "Couldn't parse the specified URL, retry please!\n";
exit;
}
}
&SplitURL($geturl);
#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;
if(!$usestdin) {
open(HEADGET, "$linkcheck $geturl|") ||
die "Couldn't get web page for some reason";
headget:
while(<HEADGET>) {
# print $_;
if($_ =~ /HTTP\/.*3\d\d /) {
$pagemoved=1;
}
elsif($pagemoved &&
($_ =~ /^Location: (.*)/)) {
$geturl = $1;
&SplitURL($geturl);
$pagemoved++;
last headget;
}
}
close(HEADGET);
if($pagemoved == 1) {
print "Page is moved but we don't know where. Did you forget the ",
"traling slash?\n";
exit;
}
open(WEBGET, "$htmlget $geturl|") ||
die "Couldn't get web page for some reason";
while(<WEBGET>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
# print $line."\n";
$in=$in.$line;
}
close(WEBGET);
}
else {
while(<STDIN>) {
$line = $_;
push @indoc, $line;
$line=~ s/\n//g;
$line=~ s/\r//g;
$in=$in.$line;
}
}
sub GetLinks {
my $in = $_[0];
my @result;
getlinkloop:
while($in =~ /[^<]*(<[^>]+>)/g ) {
# we have a tag in $1
$tag = $1;
if($tag =~ /^<!--/) {
# this is a comment tag, ignore it
}
else {
if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ )>]*)/i) {
$url=$2;
if($url =~ /^\"(.*)\"$/) {
# this was a "string" now $1 has removed the quotes:
$url=$1;
}
$url =~ s/([^\#]*)\#.*/$1/g;
if($url eq "") {
# if the link was nothing than a #-link it may now have
# been emptied completely so then we skip the rest
next getlinkloop;
}
if($done{$url}) {
# if this url already is done, do next
$done{$url}++;
next getlinkloop;
}
$done{$url} = 1; # this is "done"
push @result, $url;
if($tag =~ /< *([^ ]+)/) {
# print "TAG: $1\n";
$tagtype{$url}=$1;
}
}
}
}
return @result;
}
@links = &GetLinks($in);
linkloop:
for(@links) {
$url = $_;
if($url =~ /^([^:]+):/) {
$link = $url;
}
else {
# this is an absolute link on the same server:
if($url =~ /^\//) {
# from root
$link = "$getprotocol://$getserver$url";
}
else {
# from the scanned page's dir
$nyurl=$url;
if(length($getpath) &&
($getpath !~ /\/$/) &&
($nyurl !~ /^\//)) {
# lacks ending slash, add one to the document part:
$nyurl = "/".$nyurl;
}
$link = "$getprotocol://$getserver/$getpath$nyurl";
}
}
if($link =~ /$getregex/) {
if($display) {
print "$link\n";
}
else {
if($verbose) {
print "Gets $link\n";
}
print `$urlget $link`;
}
}
}

View File

@ -1,104 +0,0 @@
#!/usr/bin/perl
#
# Author: Daniel Stenberg <daniel@haxx.se>
# Version: 0.1
# Date: October 10, 2000
#
# This is public domain. Feel free to do whatever you please with this script.
# There are no warranties whatsoever! It might work, it might ruin your hard
# disk. Use this on your own risk.
#
# PURPOSE
#
# This script uses a local directory to maintain a "mirror" of the curl
# packages listed in the remote curl web sites package list. Files present in
# the local directory that aren't present in the remote list will be removed.
# Files that are present in the remote list but not in the local directory
# will be downloaded and put there. Files present at both places will not
# be touched.
#
# WARNING: don't put other files in the mirror directory, they will be removed
# when this script runs if they don't exist in the remote package list!
#
# this is the directory to keep all the mirrored curl files in:
$some_dir = $ARGV[0];
if( ! -d $some_dir ) {
print "$some_dir is not a dir!\n";
exit;
}
# path to the curl binary
$curl = "/home/danste/bin/curl";
# this is the remote file list
$filelist = "http://curl.haxx.se/download/curldist.txt";
# prepend URL:
$prepend = "http://curl.haxx.se/download";
opendir(DIR, $some_dir) || die "can't opendir $some_dir: $!";
@existing = grep { /^[^\.]/ } readdir(DIR);
closedir DIR;
$LOCAL_FILE = 1;
$REMOTE_FILE = 2;
# create a hash array
for(@existing) {
$allfiles{$_} |= $LOCAL_FILE;
}
# get remote file list
print "Getting file list from $filelist\n";
@remotefiles=`$curl -s $filelist`;
# fill in the hash array
for(@remotefiles) {
chomp;
$allfiles{$_} |= $REMOTE_FILE;
$remote++;
}
if($remote < 10) {
print "There's something wrong. The remote file list seems too smallish!\n";
exit;
}
@sfiles = sort { $a cmp $b } keys %allfiles;
$leftalone = $downloaded = $removed = 0;
for(@sfiles) {
$file = $_;
$info = $allfiles{$file};
if($info == ($REMOTE_FILE|$LOCAL_FILE)) {
print "$file is LOCAL and REMOTE, left alone\n";
$leftalone++;
}
elsif($info == $REMOTE_FILE) {
print "$file is only REMOTE, getting it...\n";
system("$curl $prepend/$file -o $some_dir/$file");
$downloaded++;
}
elsif($info == $LOCAL_FILE) {
print "$file is only LOCAL, removing it...\n";
system("rm $some_dir/$file");
$removed++;
}
else {
print "Problem, file $file was marked $info\n";
}
$loops++;
}
if(!$loops) {
print "No remote or local files were found!\n";
exit;
}
print "$leftalone files were already present\n",
"$downloaded files were added\n",
"$removed files were removed\n";

View File

@ -1,67 +0,0 @@
#!@PERL@
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Date: August 25 1998
# Version: 0.1
#
# This is just meant as an example of why we wrote curl in the first place.
# Quick n' easy scripting use.
#
$dir = $ARGV[0];
$target = $ARGV[1];
$maxdepth = $ARGV[2];
if($dir eq "" || $target eq "") {
print "Usage: <URL> <dir> [max depth level] \n";
print " End the URL with a slash if a directory is specified, please\n";
exit;
}
if(($maxdepth ne "") && ($maxdepth == 0)) {
# reached maximum depth, die
print "Reached maximum recursive depth level ($maxdepth), exiting...\n";
exit;
}
# get dir
@all = `curl -s $dir`;
if($all[0] ne "") {
print "Got the main $dir dir\n";
}
line:
for(@all) {
chop; # cut off newline
@linep= split(" ", $_);
$name = $linep[$#linep];
$firstletter=substr($linep[0], 0, 1);
if($firstletter eq "d") {
# this is a subdir, recurse
# if not . or .. of course
if(($name eq ".") || ($name eq "..")) {
next line;
}
print "Recursing for dir $dir$name in target $target/$name\n";
$nextdepth=$maxdepth-1;
print `$0 $dir$name/ $target/$name $nextdepth`;
}
elsif($firstletter eq "-") {
# this is a file, get it
# oh, make sure the target dir exists first
if(! -r $target ) {
mkdir($target,0777);
}
print "Getting file $dir$name in target $target/$name\n";
print `curl -s $dir$name >$target/$name`;
}
}