#!/usr/bin/perl
#
# Search either a local directory or a web site for documents containing
# one or more keywords, and report which keywords each document contains.
#
# Keywords are used as regular expressions. File searches are
# case-sensitive; web searches are case-insensitive and match across lines.
#
# Output is one line per matching document:
#     with -f:     document|keyword|count   (one line per keyword)
#     without -f:  document|keyword,keyword,...

use strict;
use warnings;

use Getopt::Long;
use File::Find;
use LWP::Simple;
use HTML::LinkExtor;
use URI::URL;

# Command-line options (see Usage() for their meaning).
my ($directory, $extension, $frequency, $depth, $logfile, $output,
    $verbose, $help, $recurse, $host);

my @words;           # keyword patterns given on the command line
my $words;           # all keywords joined into one alternation
my %count;           # $count{$document}{$keyword} = number of hits
my %Docs;            # web pages already visited, keyed without trailing '/'
my $p;               # shared HTML::LinkExtor instance (web searches)
my $host_prefix;     # "scheme://host" that links must start with under -H
my $extension_re;    # compiled -e filter, if any
my $URL;             # starting point of a web search
my ($log_fh, $out_fh);

Getopt::Long::Configure("bundling");
GetOptions(
    "h+"  => \$help,
    "v+"  => \$verbose,
    "r+"  => \$recurse,      # For file searches
    "f+"  => \$frequency,
    "H+"  => \$host,
    "d:s" => \$directory,
    "e:s" => \$extension,
    "l:s" => \$logfile,
    "o:s" => \$output,
    "D:i" => \$depth,        # For web searches
) or Usage();

Usage(0) if $help;

@words = @ARGV;
chomp @words;
Usage() unless @words;

# Without -D only the starting page is examined.
$depth = 0 unless defined $depth;

# In web mode the first argument is the URL, not a keyword. Remove it
# before building the alternation so the URL itself is never searched for.
unless ($directory) {
    Usage() unless $words[0] =~ m!^(?:ht|f)tp://!;
    $URL = shift @words;
    Usage() unless @words;    # an empty pattern would match anything
}
$words = join '|', @words;

if ($logfile) {
    open($log_fh, '>', $logfile)
        or die("Could not open $logfile for writing: $!\n");
}

if ($directory) {
    # Search local file system
    $directory =~ s!/$!! unless $directory eq '/';
    print "Searching $directory\n" if $verbose;
    print {$log_fh} "Searching $directory\n" if $log_fh;

    if ($extension) {
        # Only dots are escaped, so the extension may still be a pattern.
        (my $pattern = $extension) =~ s!\.!\\.!g;
        $extension_re = qr/$pattern$/;
    }

    if ($recurse) {
        # no_chdir keeps $File::Find::name valid relative to our own
        # working directory, so search() can open it directly.
        find({ wanted => \&search, no_chdir => 1 }, $directory);
    }
    else {
        opendir(my $dh, $directory)
            or die "Could not read $directory: $!\n";
        my @files = grep { -T "$directory/$_" } readdir $dh;
        closedir $dh;

        @files = grep { $_ =~ $extension_re } @files if $extension_re;
        print "Found files: ", join(", ", @files), "\n" if $verbose;
        search("$directory/$_") for @files;
    }
}
else {
    # Search WWW
    $p = HTML::LinkExtor->new();
    if ($host) {
        ($host_prefix) = $URL =~ m{\A([a-z]+://[^/]+)}i;
        $host_prefix = $URL unless defined $host_prefix;
        print "Restricting queries to pages on $host_prefix\n" if $verbose;
    }
    searchpage(0, $URL);
    if ($verbose) {
        print "Looked at $_\n" for keys %Docs;
    }
}

# Results
if ($output) {
    open($out_fh, '>', $output)
        or die("Could not open $output for writing: $!\n");
}

for my $doc (sort keys %count) {
    my @found = sort keys %{ $count{$doc} };
    if ($frequency) {
        # With -f: one line per keyword, including its hit count
        _report("$doc|$_|$count{$doc}{$_}\n") for @found;
    }
    else {
        # Without -f: one line per document listing the keywords found
        _report("$doc|" . join(',', @found) . "\n");
    }
}

if ($log_fh) {
    close $log_fh or warn "Could not close $logfile: $!\n";
}
if ($out_fh) {
    close $out_fh or die "Could not finish writing $output: $!\n";
}

# Emit one result line: to the -o file when given (echoed to STDOUT only
# under -v), otherwise to STDOUT.
sub _report {
    my ($text) = @_;
    if ($out_fh) {
        print {$out_fh} $text;
        print $text if $verbose;
    }
    else {
        print $text;
    }
    return;
}

# Search one text file for the keywords and record per-keyword line hits
# in %count. Used both as the File::Find callback (the path comes from
# $File::Find::name) and directly with the path as its only argument.
# Directories, binary files, files not matching -e, and unreadable files
# are skipped.
sub search {
    my $file = defined $File::Find::name ? $File::Find::name : shift;
    return unless defined $file && -f $file;
    return if -B $file;
    return if $extension_re && $file !~ $extension_re;

    print "Searching $file\n" if $verbose;

    my $fh;
    unless (open($fh, '<', $file)) {
        warn "Could not open $file for reading: $!\n";
        return;
    }

    while (my $line = <$fh>) {
        next unless $line =~ /$words/;
        print " $.:\t$line" if $verbose;
        for my $word (@words) {
            $count{$file}{$word}++ if $line =~ /$word/;
        }
    }
    close $fh;
    return;
} # End sub search

# Fetch $url, record keyword hit counts for it in %count, then follow its
# <a href> links recursively while the link depth does not exceed -D.
#   $cur_depth - link distance of $url from the starting page
#   $url       - absolute URL to fetch
# Returns 0 when the page is beyond the depth limit or cannot be fetched.
sub searchpage {
    my ($cur_depth, $url) = @_;

    # Pages are tracked and reported without a trailing slash, but fetched
    # and used as a link base with it, so relative links resolve correctly.
    (my $key = $url) =~ s!/$!!;

    print "Looking at $url, at depth $cur_depth\n" if $verbose;
    $Docs{$key} = 1;
    return 0 if $cur_depth > $depth;

    my $content = get($url);
    unless (defined $content) {
        print "Could not fetch $url\n" if $verbose;
        return 0;
    }

    if ($content =~ m/$words/is) {
        for my $word (@words) {
            my $hits = () = $content =~ /$word/gis;
            $count{$key}{$word} = $hits if $hits;
        }
    }

    # eof flushes buffered text so the shared parser starts clean on the
    # next page; links() returns and clears the collected link list.
    $p->parse($content);
    $p->eof;
    my @links = $p->links;

    for my $link (@links) {
        my ($tag, $attr, $href) = @$link;
        next unless defined $href;
        next unless $tag eq 'a' && $attr eq 'href';

        my $target = url($href, $url)->abs->as_string;
        $target =~ s/#.*$//;
        (my $target_key = $target) =~ s!/$!!;

        # Skip some URLs
        next unless $target_key;
        next if $target_key =~ /^mailto/i;
        next if $target_key =~ /(gz|zip|exe|tar|Z)$/;
        next if defined $host_prefix
            && $target_key !~ m{^\Q$host_prefix\E(?:[/?]|\z)};
        next if $target_key =~ /\?\S+?=\S+/;

        searchpage($cur_depth + 1, $target)
            unless $Docs{$target_key} || $cur_depth + 1 > $depth;
    }
    return;
} # End sub searchpage

# Print a usage summary and exit.
#   $status - optional exit status (defaults to 1; -h passes 0)
# NOTE(review): the original usage text was lost from the source; this
# message is rebuilt from the GetOptions table above. Confirm the wording
# and exit status against the upstream script if it is available.
sub Usage {
    my ($status) = @_;
    $status = 1 unless defined $status;

    print <<"EOF";
Usage:
    $0 [options] -d DIRECTORY keyword [keyword ...]
    $0 [options] URL keyword [keyword ...]

General options:
    -h          Show this help and exit
    -v          Verbose: show progress and matching lines
    -f          Report how often each keyword was found
    -l FILE     Write a log to FILE
    -o FILE     Write the results to FILE

File searches:
    -d DIR      Directory to search
    -r          Recurse into subdirectories
    -e EXT      Only search files whose name ends in EXT

Web searches:
    -D DEPTH    Follow links up to DEPTH levels from URL (default 0)
    -H          Only follow links on the same host as URL
EOF
    exit $status;
} # End sub Usage

=for comment
The module list under PREREQUISITES was unreadable in the source this file
was recovered from; it has been regenerated from the use statements above.

=head1 PREREQUISITES

This script requires C<Getopt::Long>, C<File::Find>, C<LWP::Simple>,
C<HTML::LinkExtor> and C<URI::URL>.

=head1 COREQUISITES

None

=head1 README

Simple command-line tool for searching either a web site, or a local
directory, for documents containing particular keyword(s).

=pod OSNAMES

Any

=pod SCRIPT CATEGORIES

Search

=head1 Author

Written by Rich Bowen for The Creative Group

=cut