#!/usr/bin/perl -w # # Copyright (C) 2007, Joshua D. Abraham (jabra@spl0it.org) # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # use strict; # # rip_links.pl - extracts links from a webpage # # ex: ./rip_links.pl -u http://www.digg.com # use strict; use Getopt::Long; use HTML::SimpleLinkExtor; use vars qw( $PROG ); ( $PROG = $0 ) =~ s/^.*[\/\\]//; # Truncate calling path from the prog name my $AUTH = 'Joshua D. Abraham'; # author my $EMAIL = 'jabra@spl0it.org'; # email my $VERSION = '1.00'; # version my %options; # getopt option hash my $concat; my $scheme ; my $extor = HTML::SimpleLinkExtor->new(); # # help: # display help information # sub help { print "Usage: $PROG [Input Option] [Option] -u --url Parse url and extract links -f --file Parse file and extract links --all Print all Links (Relative and Absolute) -a --authority Print only the scheme and authority(Only Absolute) -c --concat [string] Concatenate string onto end of link(Only Absolute) -s --scheme Print the links of specific scheme(Only Absolute) -r --relative Print all the Relative Links -v --version Display version -h --help Display this information Send Comments to $AUTH ( $EMAIL )\n"; exit; } # # print_version: # displays version # sub print_version { print "$PROG version $VERSION by $AUTH ( $EMAIL )\n"; exit; } # # print_authority # prints the scheme and authority for a URI # sub print_authority { my ($uri) = @_; my ($scheme, $authority, $path, $query, $fragment) = $uri =~ m/(?:([^:\/?#]+):)?(?:\/\/([^\/?\#]*))?([^?#]*)(?:\?([^#]*))?(?:\#(.*))?/; print $scheme . '://' . $authority . "\n"; } if ( @ARGV == 0 ) { help; exit; } GetOptions( \%options, 'file|f=s', 'url|u=s', 'all','concat|c=s', 'authority|a','scheme|s=s','relative|r', 'help|h' => sub { help(); }, 'version|v' => sub { print_version(); }, ) or exit 1; if ( $options{file} and -r $options{file} ) { $extor->parse_file( $options{file} ); } elsif ( $options{url} ) { $extor->parse_url( $options{url} ); } else { print "Error: Input type not set\n"; help(); } my @links; if ( $options{all} ) { @links = $extor->links(); } elsif ( $options{relative} ){ @links = $extor->relative_links; } elsif ( $options{scheme} ) { $options{scheme} =~ s/:\/\///g; @links = $extor->schemes( $scheme ); } else { @links = $extor->schemes( qw( http https ) ) } if ($options{authority}){ foreach (@links){ print_authority($_); } } elsif ( $options{concat} ) { $concat = $options{concat}; foreach (@links){ print "$_$concat\n"; } } else { foreach(@links){ print "$_\n"; } }