#  ---------------------------------------------------------------------
#   ODL/OAI Harvester
#    v1.1
#    January 2002
#  ------------------+--------------------+-----------------------------
#   Hussein Suleman  |   hussein@vt.edu   |    www.husseinsspace.com    
#  ------------------+--------------------+-+---------------------------
#   Department of Computer Science          |        www.cs.vt.edu       
#     Digital Library Research Laboratory   |       www.dlib.vt.edu      
#  -----------------------------------------+-------------+-------------
#   Virginia Polytechnic Institute and State University   |  www.vt.edu  
#  -------------------------------------------------------+-------------


package XOAI::Harvester;


use Pure::EZXML;
use Pure::LockTS;
use XOAI::XOAISP;


# constructor
sub new
{
   my ($classname) = @_;

   my $self = {
      class           => $classname,

      # global defaults
      overlap         => 172800,
      interrequestgap => 15,
      granularity     => 'day',
   };
   
   bless $self, $classname;
   return $self;
}


# convert UTC time to ISO8601 format
sub toISO8601
{
   my ($self, $atime, $granularity) = @_;
   
   my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime ($atime);
   if ((defined $granularity) && ($granularity eq 'second'))
   {
      sprintf ("%04d-%02d-%02dT%02d:%02d:%02d+00:00", $year+1900, $mon+1, $mday, $hour, $min, $sec);
   }
   else
   {
      sprintf ("%04d-%02d-%02d", $year+1900, $mon+1, $mday);
   }
}


# harvest OAI/ODL data based on configuration and parameters
#  harvest expects an archive parameter in the form:
#  [
#   { identifier => 'somename',
#     url        => 'someurl',
#     interval   => harvestinggranularityinseconds
#     metadataPrefix => someprefix(multiple?)
#     set        => someset,
#     overlap    => harvestingoverlapinseconds
#     interrequestgap => inseconds
#   },
#   { ... }
#  ]
#
sub harvest
{
   my ($self, $archivelist, $when, $what, $fromwhere) = @_;
   
   # set default values for parameters
   if ((! defined $fromwhere) || ($fromwhere ne 'start'))
   {
      $fromwhere = 'last';
   }
   if (!defined $what)
   {
      $what = 'all';
   }
   if ((! defined $when) || ($when ne 'now'))
   {
      $when = 'schedule';
   }

   # filter archives that are not selected by the $what parameter
   my @filteredarchivelist = ();
   foreach my $archive (@$archivelist)
   {
      if (($what eq 'all') || ($archive->{'identifier'}->[0] eq $what))
      {
         push (@filteredarchivelist, $archive);
      }
   }
   
   # iterate over archives in filter list
   foreach my $archive (@filteredarchivelist)
   {
      my $outputheader = 0;

      # get configuration variables
      my $archiveidentifier = $archive->{'identifier'}->[0];
      my $archiveurl = $archive->{'url'}->[0];
      my $archiveinterval = $archive->{'interval'}->[0];
      my @archivemetadataPrefixes = @{$archive->{'metadataPrefix'}};
      my $archiveset = '';
      if (exists $archive->{'set'})
      {
         $archiveset = $archive->{'set'}->[0];
      }
      my $archiveoverlap = $self->{overlap};
      if (exists $archive->{'overlap'})
      {
         $archiveoverlap = $archive->{'overlap'}->[0];
      }
      my $archiveinterrequestgap = $self->{interrequestgap};
      if (exists $archive->{'interrequestgap'})
      {
         $archiveinterrequestgap = $archive->{'interrequestgap'}->[0];
      }
      my $archivegranularity = $self->{granularity};
      if (exists $archive->{'granularity'})
      {
         $archivegranularity = $archive->{'granularity'}->[0];
      }
            
      # create unique token for this harvesting operation - lock the archive
      my $ts = new Pure::LockTS ("data/$archiveidentifier.lock");

      # use non-blocking test-and-set to avoid overlapping harvesting
      if ($ts->test == 1)
      {
         # get from and until dates
         my $archivemetadataPrefix = 'all';
         if ($#archivemetadataPrefixes == 0)
         {
            $archivemetadataPrefix = $archivemetadataPrefixes[0];
         }
      
         # read in from date for this archive and metadataPrefix
         my $fromdate;
         if ((-e "data/$archiveidentifier.$archivemetadataPrefix.date") &&
             ($fromwhere eq 'last'))
         {
            open (FILE, "<data/$archiveidentifier.$archivemetadataPrefix.date");
            $fromdate = <FILE>;
            close (FILE);
            chomp $fromdate;
         }
         else
         {
            $fromdate = 315550800; # 1 jan 1980
         }
         
         my $untildate = time;
         
         # check if new harvest is required or forced
         if ((($fromdate + $archiveinterval) < $untildate) ||
             ($when eq 'now'))
         {
            # enforce overlap to cater for timezones and granularity
            if (($untildate - $fromdate) < $archiveoverlap)
            {
               $fromdate = $untildate - $archiveoverlap;
            }
            
            # build request;
            my $oai = new XOAI::XOAISP ($archiveurl);
            my $verb = '';
            my %parameters = ();
            if ($#archivemetadataPrefixes == 0)
            {
               $verb = 'ListRecords';
               $parameters{'metadataPrefix'} = $archivemetadataPrefix;
            }
            else
            {
               $verb = 'ListIdentifiers';
            }
            if ($fromdate != 315550800)
            {
               $parameters{'from'} = $self->toISO8601 ($fromdate, $archivegranularity);
            }
            $parameters{'until'} = $self->toISO8601 ($untildate, $archivegranularity);
            if ($archiveset ne '')
            {
               $parameters{'set'} = $archiveset;
            }

            # submit first request
            my $res = $oai->doXOAI ($verb, \%parameters);
            
            # halt on errors
            if ($res->code != 200)
            {
               $ts->clear;
               die "HTTP Error connecting to OAI data provider $archiveidentifier/$archivemetadataPrefix\n".
                   $res->code.' '.$res->message;
            }
      
            # parse and process the XML
            my $gotresumptiontoken;
            do {
               my $parser = new Pure::EZXML;
               my $doc = $parser->parse ($res->content);
               
               # using ListRecords
               if ($#archivemetadataPrefixes == 0)
               {               
                  for my $record ($doc->getDocumentElement->getElementsByTagName ("record"))
                  {
                     if ($outputheader == 0)
                     {
                        $outputheader = 1;
                        $self->processHarvestStart ($archiveidentifier, $archiveurl,
                                  $archiveinterval, \@archivemetadataPrefixes,
                                  $archiveset, $archiveoverlap,
                                  $archiveinterrequestgap, $archivegranularity);
                     }
                     $self->processIdStart ($record->getElementsByTagName ("header")
                                       ->item(0)->getElementsByTagName ("identifier")
                                       ->item(0)->getFirstChild->getNodeValue,
                                       $record->getAttribute ('status'));
                     $self->processRecord ($record, $untildate, $archivemetadataPrefix, $archiveset, $archiveidentifier);
                     $self->processIdEnd ($record->getElementsByTagName ("header")
                                       ->item(0)->getElementsByTagName ("identifier")
                                       ->item(0)->getFirstChild->getNodeValue);
                  }
               }
               # using ListIdentifiers / GetRecord(s)
               else
               {
                  for my $recordid ($doc->getDocumentElement->getElementsByTagName ("identifier"))
                  {
                     if ($outputheader == 0)
                     {
                        $outputheader = 1;
                        $self->processHarvestStart ($archiveidentifier, $archiveurl,
                                  $archiveinterval, \@archivemetadataPrefixes,
                                  $archiveset, $archiveoverlap,
                                  $archiveinterrequestgap, $archivegranularity);
                     }
                     my $identifier = $recordid->getFirstChild->getNodeValue;
                     $self->processIdStart ($identifier, $recordid->getAttribute ('status'));
                     foreach my $ametadataPrefix (@archivemetadataPrefixes)
                     {
                        # submit request
                        my $res = $oai->doXOAI ('GetRecord', 
                                                { metadataPrefix => $ametadataPrefix,
                                                  identifier => $identifier }
                                               );
            
                        # halt on errors
                        if ($res->code != 200)
                        {
                           $ts->clear;
                           die "HTTP Error connecting to OAI data provider $archiveidentifier/$archivemetadataPrefix\n".
                               $res->code.' '.$res->message;
                        }
                     
                        my $doc = $parser->parse ($res->content);
                        for my $record ($doc->getDocumentElement->getElementsByTagName ("record"))
                        {
                           $self->processRecord ($record, $untildate, $ametadataPrefix, $archiveset, $archiveidentifier);
                        }
                        $doc->dispose;
                     }
                     $self->processIdEnd ($identifier);
                  }
               }         
         
               # if resumption token exists, get more data
               my $resumptiontokenNodeList = $doc->getDocumentElement->getElementsByTagName ("resumptionToken");
               if ($resumptiontokenNodeList->getLength == 1)
               {
                  my $resumptiontoken = $resumptiontokenNodeList->item(0)->getFirstChild->getNodeValue;
                  $resumptiontoken =~ s/^([\s\n\r\t]*)([^\s\n\r\t]*)([\s\n\r\t]*)$/$2/;
                  $gotresumptiontoken = 1;
                  sleep ($archiveinterrequestgap);
                  if ($#archivemetadataPrefixes == 0)
                  {
                     $verb = 'ListRecords';
                  }
                  else
                  {
                     $verb = 'ListIdentifiers';
                  }
                  $res = $oai->doXOAI ($verb, { resumptionToken => $resumptiontoken } );
                  
                  # halt on errors
                  if ($res->code != 200)
                  {
                     $ts->clear;
                     die "HTTP Error connecting to OAI data provider $archiveidentifier/$archivemetadataPrefix\n".
                         $res->code.' '.$res->message;
                  }
               }
               else
               {
                  $gotresumptiontoken = 0;
               }
         
               $doc->dispose;
            } while ($gotresumptiontoken > 0);
            
            # save current date
            open (FILE, ">data/$archiveidentifier.$archivemetadataPrefix.date");
            print FILE "$untildate\n";
            close (FILE);

         }

         $ts->clear;
      }
   
      if ($outputheader == 1)
      {
         $self->processHarvestEnd;
      }
   }
}


# stub routine for beginning of harvest run
sub processHarvestStart
{
   my ($self, $archiveidentifier, $archiveurl, $archiveinterval, 
       $archivemetadataPrefixes, $archiveset, $archiveoverlap,
       $archiveinterrequestgap, $archivegranularity) = @_;
   print "\nHarvesting:\n";
   print "  archive         = $archiveidentifier\n";
   print "  url             = $archiveurl\n";
   print "  interval        = $archiveinterval days\n";
   foreach my $archivemetadataPrefix (@$archivemetadataPrefixes)
   {
      print "  metadataPrefix  = $archivemetadataPrefix\n";
   }
   print "  set             = $archiveset\n";
   print "  overlap         = $archiveoverlap\n";
   print "  interrequestgap = $archiveinterrequestgap\n\n";
   print "  granularity     = $archivegranularity\n\n";
}


# stub routine for end of harvest run
sub processHarvestEnd
{
   print "\ndone.\n";
}


# stub routine for beginning of processing for an id
sub processIdStart
{
   my ($self, $identifier, $status) = @_;
   
   print "$identifier, $status\n";
}


# stub routine for end of processing for an id
sub processIdEnd
{
   my ($self, $identifier) = @_;
   
   print "/$identifier\n";
}


# stub routine to process an XML record
sub processRecord
{
   my ($self, $xml, $date, $metadataPrefix, $set, $archiveid) = @_;
   
   my $identifier = $xml->getElementsByTagName ("header")
                        ->item(0)->getElementsByTagName ("identifier")
                        ->item(0)->getFirstChild->getNodeValue;
   while ($identifier =~ /^\s(.*)$/s) { $identifier = $1; }
   while ($identifier =~ /^(.*)\s$/s) { $identifier = $1; }

   print "$identifier in $metadataPrefix on ".$self->toISO8601 ($date, 'second')."\n";
}


1;



=head1 NAME

Harvester - Multi-purpose OAI Harvester

=head1 SYNOPSIS

  use Harvester;
 
  my $h = new Harvester ('configfile');
  
  $h->harvest;

=head1 DESCRIPTION

Harvest data periodically from an OAI data source. The 
harvester ought to be called periodically using a scheduler 
such as I<cron>.

The configuration file contains an XML description of the 
harvesting schedule such as the following:

  <?xml version="1.0" encoding="utf-8" ?>

  <config>

   <archive>
      <identifier>TEST</identifier>
      <url>http://www.test.org/cgi-bin/OAI.pl</url>
      <metadataPrefix>oai_dc</metadataPrefix>
      <interval>86400</interval>
      <set>All</set>
      <overlap>172800</overlap>
      <interrequestgap>10</interrequestgap>
   </archive>
   
  </config>
  
The above example will get Dublin Core metadata from the 
TEST archive at the most every 0.5 days (depending on how 
often harvesting is scheduled).

I<archive> is a repeatable element. I<set> specifies a set 
to harvest from. I<overlap> indicates the number of 
overlapping seconds for harvesting and defaults to 172800. 
I<interrequestgap> specifies the number of seconds between 
resumption requests and defaults to 600. These three 
parameters are optional but the rest are required.

I<metadataPrefix> is repeatable within a single I<archive>
element. If there is more than one entry, the Harvester will
request the identifiers and then obtain all metadata for
each record in turn. With a single I<metadataPrefix> the
Harvester uses the more efficient I<ListRecords> service
request to get metadata for multiple records in each request.
For greater speed and lower network usage, it is better to
have two I<archive> sections for different metadata formats
rather than include them in a single I<archive> section 
(the latter leads to finer granularity of consistency if this 
is the overriding factor).

The name I<config> is ignored so can be changed if desired.

The harvester works by calling five functions that are meant 
to be overridden in a descendent class: I<processHarvestStart>
is called at the beginning of harvesting a set of records and
I<processHarvestEnd> is called at the end of a harvest;
I<processIdStart> is called before any identifier is 
processed and I<processIdEnd> is called afterwards; and 
I<processRecord> is called once for each metadata record.
 
In the last case, an I<EZXML> "record" subtree is 
passed in for processing.

The Harvester uses the I<data> subdirectory to store dates
for each archive being harvested as well as lock files. These
lock files will prevent overlapped harvesting of the same
archive, and if something goes wrong and the program
crashes, will prevent recurrences until the lock file
is manually removed.

=head1 METHODS

=over 4

=item new (config_filename)

Creates a new harvester using the specified filename as a 
schedule configuration.

=item harvest (force)

Harvest metadata from all archives as specified in the 
configuration file. If this function is called many times 
by different instances, locking of individual archives 
allows different instances to work on different archives.

<force> is an optional parameter. If present and it has a
value of 1 it forces harvesting irrespective of schedule.
If present and it has a value of 2 it causes a fresh
harvest of the entire archive.

=item processHarvestStart (identifer,url,interval,
metadataPrefixes,set,overlap,interrequestgap)

Callback function to handle configuration information for 
archive being harvested (by default this is output to 
STDOUT). The parameters correspond to an entry in the 
configuration file - all are strings except 
I<metadataPrefixes> which is a reference to a list of 
strings.

=item processHarvestEnd ()

Callback function called after harvesting has completed.

=item processIdStart (identifier,status)

Callback function called just before metadata for a record
is processed.

=item processIdEnd (identifier)

Callback function called after all metadata for a record
has been processed.

=item processRecord (xml,date,metadataPrefix,set,archiveid)

Callback function to handle "record" XML fragment, passed in 
as an I<EZXML Node>. The I<date> refers to the date of 
harvesting and the I<metadataPrefix> and I<set> are the 
values sent to the data provider. I<archiveid> is a unique 
identifier for the archive.

=back

=head1 NOTES

This is part of the larger effort to build componentized digital libraries.

=head1 CAVEATS

Of course this is largely  untested :)

=head1 BUGS

None that I found yet.

=head1 AUTHOR

Hussein Suleman <hussein@vt.edu>

=head1 HISTORY

This is the first release version.

=cut

