#!/usr/bin/perl
use strict;
use warnings;

# get dump of HTML files
#
# Mirror the pages below $starturl with wget. The options used are:
#   -l<levels>  recursion depth
#   -r          recursive retrieval
#   -k          convert links for local viewing
#   -E          save HTML documents with an .html extension
#   -A.html     accept only files ending in .html
my $host = 'simba.cs.uct.ac.za';
my $starturl = 'http://'.$host.'/localdocs';
my $levels = 3;

# List-form system() runs wget directly, with no shell parsing of the
# arguments.
my $wget_status = system ('wget', '-l'.$levels, '-r', '-k', '-E', '-A.html', $starturl);

# Report problems but carry on: wget also exits non-zero when only some of
# the pages could not be fetched, and whatever was downloaded should still
# be cleaned up.
if ($wget_status == -1)
{
    warn "Could not run wget: $!\n";
}
elsif ($wget_status != 0)
{
    warn 'wget finished with exit status '.($wget_status >> 8)."\n";
}
# clean up files
# cleandir ($directory)
#
# Recursively walk $directory. Each sub-directory is descended into; every
# other entry is passed to `tidy` (which rewrites the file in place as XML)
# and then checked with `xmllint`. Files that xmllint rejects are deleted.
# The path of every visited entry is printed to STDOUT.
#
# Parameters:
#   $directory - path of the directory tree to clean.
#
# Returns nothing. A directory that cannot be opened, or a file that cannot
# be deleted, is reported with a warning and skipped.
sub cleandir
{
    my ($directory) = @_;

    # Do not try to read from a handle that failed to open.
    my $dir;
    unless (opendir ($dir, $directory))
    {
        warn "Cannot open directory $directory: $!\n";
        return;
    }
    my @files = readdir ($dir);
    closedir ($dir);

    # Skip exactly the self and parent entries. Matching on "contains an
    # alphanumeric" would also skip legitimate names such as "_" or "~".
    foreach my $afile (grep { $_ ne '.' && $_ ne '..' } @files)
    {
        my $path = $directory.'/'.$afile;
        print $path."\n";
        if (-d $path)
        {
            cleandir ($path);
        }
        else
        {
            print 'Cleaning: '.$path."\n";

            # List-form system() bypasses the shell, so characters such as
            # '?', '&', ';' or spaces in mirrored file names reach the tools
            # as part of a single path argument.
            system ('tidy', '-asxml', '-q', '-m', '-n', '-u', $path);
            my $res = system ('xmllint', '--noout', $path);

            # A positive status means xmllint ran and rejected the file.
            # system() returns -1 when xmllint could not be started at all;
            # in that case the file is deliberately kept.
            if ($res > 0)
            {
                unlink ($path) or warn "Cannot delete $path: $!\n";
            }
        }
    }
    return;
}
# Clean the downloaded tree. The script expects the mirror to live in a
# directory named after the host, relative to the current directory.
cleandir($host);