#!/usr/bin/perl
# Grok itineraries from Navigant; i.e. turn them
# into RDF/n3.
#
# Hmm... Navigant probably isn't the only travel agency to
# use this format; perhaps it's a SABRE thing?
#
# $Id: grokNavItin.pl,v 1.8 2002/06/04 01:45:38 connolly Exp $
# see changelog at end
#
# @@bookmark:
#Subject: ANN: RDF Sample, ICAO Airport Codes 
#
#     From: David Megginson <david@megginson.com> 
#     To: "XML Developers' List" <xml-dev@ic.ac.uk> 
#     Date: Fri, 30 Jul 1999 10:13:47 -0400 (EDT) 
# http://lists.xml.org/archives/xml-dev/199907/msg00415.html
# http://www.megginson.com/samples/rdf/airports.rdf

use strict;

# When true, lines the parser does not recognise are reported on STDERR.
my $Verbose = 1;

# Turn on autoflush for the currently selected output handle.
$| = 1;

# Namespace URIs used to build property and class names.
my $rdfNS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
my $paNS  = "http://www.w3.org/2000/08/palm56/addr#";
my $pdNS  = "http://www.w3.org/2000/08/palm56/datebook#";
my $dcNS  = "http://purl.org/dc/elements/1.1/";
my $navNS = "http://www.w3.org/2001/07dc-bos/grokNavItin#";

# Cache consulted and filled by the(): (property, value) -> generated term.
my %Things;

# Emit one @prefix line per namespace, in a fixed order.
# NB: the leading '&' is required; a bare bind(...) would be parsed as
# Perl's built-in socket bind.
foreach my $binding (["r",   $rdfNS],
                     ["pa",  $paNS],
                     ["pd",  $pdNS],
                     ["dc",  $dcNS],
                     ["nav", $navNS]) {
  &bind($binding->[0], $binding->[1]);
}

# Counter behind genSym(); it is incremented before use.
my $gen = 1;

# Three-letter upper-case month abbreviation -> month number.
my %monthNameToNum = (
  JAN => 1,  FEB => 2,  MAR => 3,
  APR => 4,  MAY => 5,  JUN => 6,
  JUL => 7,  AUG => 8,  SEP => 9,
  OCT => 10, NOV => 11, DEC => 12,
);

# Parse the input and print the statements.
slurp();
#@@ &burp();

# slurp: read itinerary text line by line from <> (the files named on the
# command line, or STDIN) and print statements via makeStatement().
#
# The parser is a three-state machine:
#   'start'   - header lines (agency phone, itinerary date, customer
#               number, traveller name, reference) and day headings;
#               a day heading sets $day and moves to 'inDay'.
#   'inDay'   - expects an AIR ... FLT: line; creates a flight event and
#               moves to 'inEvent'.
#   'inEvent' - LV/AR (departure/arrival) segments and SEAT- lines for
#               the current flight; an AIR line or a day heading switches
#               state and re-processes the same line.
#
# Takes no arguments.  Unrecognised lines are reported with warn when
# $Verbose is set.  Dies on an unknown state, or (via fmtDate) on an
# unknown month name.
sub slurp{
  my($state) = 'start';
  my($agency, $traveller);
  my($day);    # date of the day heading most recently seen, as formatted by fmtDate
  my($event);  # term for the flight currently being described

 LINE:
  while(<>){

    if($state eq 'start'){

      #e.g.        617 451-4200 TEL
      if(/(\d\d\d) (\d\d\d-\d\d\d\d) TEL/){
        my($areaCode, $localNum) = ($1, $2);

        # ground the company in URI space via their main telephone number...
        $agency = genSym("agency");
        makeStatement("", $dcNS . "creator", $agency);
        makeStatement($agency, $paNS . "Main",
                      sprintf("tel:+1-%s-%s", $areaCode, $localNum));
      }

      #e.g. >  SALES PERSON: 53     ITINERARY            DATE: 15 JUN
      #     >01
      elsif(/ITINERARY\s+DATE: (\d\d) (\w+)/){
        my($dd, $mon) = ($1, $2);

        # the year is expected on the following line; that line may be
        # missing altogether if the input ends here.
        my($line) = scalar(<>);
        if(defined($line) && $line =~ /(\d\d)/){
          my($yy) = $1;

          # consider saying this is a u:SubClassOf <http://www.cyc.com/cyc-2-1/vocab/transportation-vocab.html#ItineraryDocument>
          makeStatement("", $rdfNS . "type", $navNS . "Itinerary");
          makeStatement("", $dcNS . "date", '', fmtDate($dd, $mon, $yy));
        }
        else{
          warn "cannot find year.";
        }
      }

      #e.g. >  CUSTOMER NBR: 6160150001     TBZTQY         PAGE: 01
      elsif(m,CUSTOMER NBR: (\d+)\s+(\w+),){
        my($custNum, $recLoc) = ($1, $2);
        makeStatement("", $navNS . "customerNumber", '', $custNum);
        makeStatement("", $navNS . "sixLetters", '', $recLoc);
      }

      #e.g. >  FOR: CONNOLLY/DANIEL
      elsif(m,FOR:\s+(\w+)/(\w+),){
        my($fam, $given) = ($1, $2);

        $traveller = genSym("traveller");

        makeStatement("", $navNS . "for", $traveller); # also say rdfs:label "FOR"?
        makeStatement($traveller, $paNS . "name", '', $fam);
        makeStatement($traveller, $paNS . "firstName", '', $given);
      }

      #e.g. >  REF: 6264400
      elsif(m,\s+REF: (\d+),){
        my($ref) = $1;

        makeStatement("", $navNS . "ref", '', $ref);
      }

      #e.g. >  11 JUL 01  -  WEDNESDAY
      elsif(/(\d\d) (\w+) (\d\d)\s+-\s+([A-Z]+)/){
        my($dd, $mon, $yy, $dow) = ($1, $2, $3, $4);
        print STDERR "ala 11 JUL 01  -  WEDNESDAY: $dd, $mon, $yy, $dow\n";
        $day = fmtDate($dd, $mon, $yy);

        # the() hands back the same term for every mention of this date.
        my($d) = the($navNS . 'date', $day, "day");
        makeStatement($d, $navNS . 'dayName', '', $dow);

        $state = 'inDay';
      }

      #e.g. >  SUNDAY, 13 JANUARY
      elsif(/([SMTWF][A-Z]+), (\d\d) ([A-Z]+)/){
        my($dow, $dd, $mon)  = ($1, $2, $3);

        # this heading style carries no year.
        my($yy) = 2;
        print STDERR "@@ kludged date to 2002. $dow $dd $mon\n";
        $day = fmtDate($dd, $mon, $yy);

        my($d) = the($navNS . 'date', $day, "day");
        makeStatement($d, $navNS . 'dayName', '', $dow);

        $state = 'inDay';
      }

      else{
        warn "state $state. skipping: $_" if $Verbose;
      }
    }

    elsif($state eq 'inDay'){

      #e.g. >     AIR   AMERICAN AIRLINES    FLT:1364   ECONOMY
      if(/\bAIR\b/ && /FLT:(\d+)\s+(\w+)/){
        my($flightNum, $flightClassName) = ($1, $2);

        # whatever is left once FLT:..., the AIR keyword and trailing
        # whitespace are stripped is taken to be the carrier name.
        s/FLT:.*//;
        s/\s*AIR\s*//;
        s/\s+$//;
        my($carrierName) = $_;

        $event = genSym("flt");
        makeStatement("", $navNS . "air", $event);
        makeStatement($event, $navNS . "date", '', $day);
        makeStatement($event, $navNS . "flightNum", '', $flightNum);

        # consider using
        # http://www.cyc.com/cyc-2-1/vocab/social-vocab.html#nameOfAgent
        # and http://www.cyc.com/cyc-2-1/vocab/transportation-vocab.html#AirlineCompany
        my($carrier) = the($paNS . 'company', $carrierName, 'company');
        makeStatement($event, $navNS . "carrier", $carrier);
        makeStatement($event, $navNS . "flightClassName", '', $flightClassName);
        $state = 'inEvent';
      }
      else{
        warn "inDay $day; unknown event type? $_" if $Verbose;
      }
    }

    elsif($state eq 'inEvent'){

      #e.g. >    LV KANSAS CITY INTL          144P           EQP: MD-80
      # Each matched LV/AR segment is removed from $_ so the loop ends
      # once the line holds no more of them.
      while(s/(LV|AR) ((\S|( [a-zA-Z]))+) \s*(\w\w)?\s*(\d\d?)(\d\d)(A|P)//){
        # $st is the optional two-character token before the time; it is
        # captured but not used at present.
        my($dir, $airportName, $st, $hh, $mm, $ap) = ($1, $2, $5, $6, $7, $8);

        # 12-hour -> 24-hour: 12:xxA is 00:xx, 12:xxP stays 12:xx,
        # and every other P hour gains 12.
        $hh += 12 if ($ap eq 'P' && $hh != 12);
        $hh = 0 if ($ap eq 'A' && $hh == 12);

        my($timePlace, $place);
        $timePlace = genSym("timePlace");
        $place = the($navNS . "airportName", $airportName, "airport");
        makeStatement($event, $navNS . $dir, $timePlace);
        makeStatement($timePlace, $navNS . "time", '',
                      sprintf("%02d:%02d", $hh, $mm));
        makeStatement($timePlace, $navNS . "place", $place);
      }

      #e.g. >       CONNOLLY/DANIEL   SEAT- 9B   AA-XDW5282
      if(/SEAT- *(\w+)/){
        my($seatName) = ($1);
        my($sa);
        $sa = genSym("seat");
        makeStatement($event, $navNS . "seat", $sa);
        makeStatement($sa, $navNS . "who", $traveller); # @@BUG: for >1 travellers
        makeStatement($sa, $navNS . "num", '', $seatName);
      }

      elsif(/\bAIR\b/){
        # another flight on the same day: re-run this line in 'inDay'.
        $state = 'inDay';
        redo LINE;
      }
      elsif(/(\d\d) (\w+) (\d\d)\s+-\s+(\w+)/ # 16 JAN 02 - WEDNESDAY
            || /[SMTWF]\w+, \d\d? \w+/ # SUNDAY, 13 JANUARY
           ){
        # a new day heading: re-run this line in 'start'.
        $state = 'start';
        redo LINE;
      }
      else{
        warn "state: $state not matched: $_"  if $Verbose;
      }
    }

    else{
      die "unknown state";
    }
  }
}


# fmtDate: build a YYYY-MM-DD string.
#   $dd  - day of month
#   $mon - month name; only its first three characters are looked up
#          in %monthNameToNum
#   $yy  - year offset that is added to 2000
# Dies with "bad month: ..." when the lookup does not yield 1..12.
sub fmtDate{
  my($dd, $mon, $yy) = @_;

  my $abbrev = substr($mon, 0, 3);
  my $mm = $monthNameToNum{$abbrev};
  unless($mm >= 1 && $mm <= 12){
    die "bad month: $abbrev";
  }

  my $year = 2000+$yy; #@@BUG: y3k
  return sprintf("%04d-%02d-%02d", $year, $mm, $dd);
}

# makeStatement: print one N3 statement "subject predicate object." on the
# currently selected output handle.
#   $s  - subject: a URI (the empty string gives <>) or a _:name term
#   $p  - predicate, same forms as $s
#   $or - object as a resource (URI or _:name); when false, $ol is used
#   $ol - object as a literal string
# URIs are wrapped in <...>; terms starting with "_:" are printed as-is.
# Literals are double-quoted, with backslash, double quote, newline,
# carriage return and tab backslash-escaped so the output stays well-formed.
sub makeStatement{
  my($s, $p, $or, $ol) = @_;

  # keep existentials existential...
  $s = "<$s>" unless $s =~ /^_:/;
  $p = "<$p>" unless $p =~ /^_:/;

  if($or){
    $or = "<$or>" unless $or =~ /^_:/;
    print "$s $p $or.\n";
  }else{
    $ol = '' unless defined $ol;

    # escape the characters that would otherwise end or corrupt the
    # quoted string.
    my %escapeFor = ("\\" => "\\\\",
                     "\"" => "\\\"",
                     "\n" => "\\n",
                     "\r" => "\\r",
                     "\t" => "\\t");
    $ol =~ s/([\\"\n\r\t])/$escapeFor{$1}/g;

    print "$s $p \"$ol\".\n";
  }
}

# bind: print an N3 "@prefix PFX: <NS>." declaration.
#   $pfx - prefix name (without the colon)
#   $ns  - namespace URI
# Call as &bind(...): a bare bind(...) is parsed as Perl's built-in
# socket bind.
sub bind{
  my($pfx, $ns) = @_;
  my $declaration = sprintf('@prefix %s: <%s>.', $pfx, $ns);
  printf("%s\n", $declaration);
}

# genSym: return a fresh "_:" term.
#   $hint - readable stem for the generated name
# Bumps the file-level counter $gen first, then returns "_:" followed by
# the hint and the new counter value.
sub genSym{
  my($hint) = @_;

  $gen += 1;
  return '_:' . $hint . $gen;
}


# the: return the term whose property $prop has the literal value $val,
# creating it on first request.
#   $prop - property URI
#   $val  - literal value
#   $hint - stem passed to genSym() when a new term is needed
# On a cache miss a new term is generated, the statement
# "term prop value" is printed via makeStatement(), and the term is
# remembered in %Things so later calls with the same pair return it.
sub the{
  # this assumes $prop is a daml:UniqueProperty
  my($prop, $val, $hint) = @_;

  # same key that $Things{$prop, $val} would use
  my $key = join($;, $prop, $val);

  unless($Things{$key}){
    my $term = genSym($hint);
    makeStatement($term, $prop, '', $val);
    $Things{$key} = $term;
  }
  return $Things{$key};
}

# $Log: grokNavItin.pl,v $
# Revision 1.8  2002/06/04 01:45:38  connolly
# fix 12a bug
#
# Revision 1.7  2001/12/19 22:37:23  connolly
# relaxed spacing constraints etc. for Jan 2002 itinerary; cf /2002/01dc-nj/
#
# Revision 1.6  2001/08/09 15:52:05  connolly
# added thoughts on the cyc vocab
#
# Revision 1.5  2001/07/26 22:54:54  connolly
# allow tabs in some places
#
# Revision 1.4  2001/07/26 22:39:14  connolly
# allow FOR and REF to be on separate lines, parse airport names better
#
# Revision 1.3  2001/07/03 21:02:14  connolly
# bookmarked airport code stuff from Megginson Jul 1999
#
# Revision 1.2  2001/06/16 06:30:43  connolly
# the itinerary is almost certainly written with the intent that airport names are unambiguous; updated the RDF statements to reflect this; i.e. all occurrences of the airport named CHICAGO use the same term.
#
