#!/usr/bin/perl
#
# Mirror the HTML pages reachable from a start URL with wget, then walk the
# downloaded tree: every file is rewritten in place as XHTML by tidy, and any
# file that xmllint still rejects afterwards is deleted.
#
# External programs required on PATH: wget, tidy, xmllint.

use strict;
use warnings;

# get dump of HTML files
my $host     = 'simba.cs.uct.ac.za';
my $starturl = 'http://' . $host . '/localdocs';
my $levels   = 3;

# List-form system() runs the program directly instead of going through a
# shell, so nothing in the arguments can be interpreted as shell syntax.
my $wget_status = system('wget', '-l' . $levels, '-r', '-k', '-E', '-A.html', $starturl);
if ($wget_status == -1) {
    warn "Could not run wget: $!\n";
}
elsif ($wget_status != 0) {
    # Not fatal: whatever was downloaded is still cleaned below.
    warn 'wget exited with status ' . ($wget_status >> 8) . "\n";
}

# clean up files
#
# cleandir($directory)
#
# Recursively process every entry below $directory.  Sub-directories are
# descended into; every other entry is passed to tidy (in-place conversion
# to XHTML) and then checked with xmllint.  Files that xmllint reports as
# invalid are removed.
#
# Parameters:
#   $directory - path of the directory to clean.
#
# Returns nothing.  Problems (unreadable directory, tool that cannot be
# started, failed unlink) are reported with warn and processing continues.
sub cleandir {
    my ($directory) = @_;

    my $dir;
    if (!opendir($dir, $directory)) {
        warn "Cannot open directory $directory: $!\n";
        return;
    }
    my @files = readdir($dir);
    closedir($dir);

    foreach my $afile (@files) {
        # Skip only the self/parent links; every real entry is processed,
        # including names that contain no ASCII letters or digits.
        next if $afile eq '.' || $afile eq '..';

        my $path = $directory . '/' . $afile;
        print $path . "\n";

        if (-d $path) {
            cleandir($path);
            next;
        }

        print 'Cleaning: ' . $path . "\n";

        # File names originate from remote URLs, so they are passed as
        # separate arguments rather than being pasted into a shell command.
        # tidy's non-zero exit codes are ignored on purpose: the xmllint
        # check below decides whether the result is usable.
        if (system('tidy', '-asxml', '-q', '-m', '-n', '-u', $path) == -1) {
            warn "Could not run tidy on $path: $!\n";
        }

        my $res = system('xmllint', '--noout', $path);
        if ($res == -1) {
            # xmllint itself could not be started; keep the file rather
            # than deleting something that was never actually checked.
            warn "Could not run xmllint on $path: $!\n";
        }
        elsif ($res > 0) {
            unlink $path
                or warn "Could not remove $path: $!\n";
        }
    }

    return;
}

# wget -r stores the mirror in a directory named after the host.
cleandir($host);