#!/usr/local/bin/perl -w

#
#   Copyright (c) 2001 MIT, INRIA, Keio.   All rights reserved.
#   Initially written by Sandro Hawke (sandro@w3.org)
#
#   $Id: Pool.pm,v 1.10 2001/12/26 20:48:07 sandro Exp $
#

=head1 NAME

RDF::Pool -- RDF (XML) (de)serialization of Perl objects

=head2 SYNOPSIS

    Foo.
    
    RDF::Object;
    my $o = RDF::Object::getLiteral("foo..."[, databasename]);
    my $o = RDF::Object::get("foo..."[, databasename]);
                         load(...)
    (pool)
    (save)
             loadAttrs...    implicit?

    $o = RDF::Database::obtain("http://foo.");

        RDF::Database::import(uri, pattern);
        RDF::Database::export(uri, pattern);
        my @o = RDF::Database::objects();
        my @triples = RDF::Database::triples();

        my $o = RDF::Database::object('name');
        # special case of filtering (querying), where x.uname=arg
        
        $o->getAll($p)
        $o->getAnyOne($p)
        $o->getOnly($p)
        $o->getCount($p)
        $o->add($p, $v)
        $o->set($p, $v)
        $o->setAll($p, @v)
        
          $p can be scalar uname of property object,
             or the property object itself.
          $v is always the value object itself, unless
             it's a string.

    use RDF::Pool;
    my $pool = RDF::Pool->new("http://sample.com/cars#");
    $pool->add_prefix("_color_", "http://sample.com/color#");
    my $car = bless { make=>"Honda", model=>"Civic" }, Car;
    $car->{color} = bless { _color_name=>"Light Blue" }, _color_Color;
    $pool->add($car);
    $pool->load("http:...");
    $pool->load_rdfxml_file("somefile.rdf");
    $pool->save_rdfxml_file("somefile.rdf", lock=>1, append=>1);

    ... produces: 

    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
             xmlns:color="http://sample.com/color#"
             xmlns="http://sample.com/cars#">
      <Car rdf:about="#obj0">
        <make>Honda</make>
        <color>
          <color:Color rdf:about="#obj1">
            <color:name>Light Blue</color:name>
          </color:Color>
        </color>
        <model>Civic</model>
      </Car>
    </rdf:RDF>

=head2 BASIC CONCEPT

This package contains tools for loading and saving sets of Perl
objects to RDF/XML files.  The mapping is intended to be generally
intuitive, so Perl programmers can use the data structures they want,
while the RDF/XML is decent for use with other toolkits.

In general, a "pool" is a dynamic set of RDF Statements (a mutable RDF
Graph).  An RDF::Pool object lets you convert the information between
the RDF/XML encoding and a natural encoding as Perl objects.

=head2 NAMESPACES AND PREFIXES

In RDF, objects and attributes are identified by strings conforming to
the URI-Reference syntax.  A set of names with a common prefix
(usually ending in "#") are sometimes considered a namespace, because
in the XML/RDF encoding, they can appear as XML namespaces.  Here,
were treated them simply as common prefixes; they are called
"rdf_prefix" strings in the prefix table.

Similarly, in Perl, we assume that attributes may be grouped by common
prefixes, which we call perl_prefix in the prefix table.

The two default entries in the table are these:

   perl_prefix    rdf_prefix
   ===========    ==========
    "_rdf_"        "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    ""             "#" (unless set by the parameter to new())

This means the RDF "type" property, whose full RDF name is
"http://www.w3.org/1999/02/22-rdf-syntax-ns#type" will be mapped
to/from "_rdf_type" when used as a hash key on serialize objects.

The second entry in the table says that Perl keys not matching earlier
entries in the table should be prefixed as given.  This is essentially
the default namespace.

=head2 MULTIPLE VALUES

Unlike Perl, RDF allows an object to have an attribute with multiple values. 

      car >- near -> "house".
      car >- near -> "tree".

After reading data like this, we'll have

      $car->{near} = "house";
      $car->{@near} = [ "house", "tree" ];

=cut

package RDF::Pool;

use strict;
use Carp;
use Data::Dumper;    # just for diagnostics

BEGIN {

    use Exporter   ();
    use vars       qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

    # set the version for version checking
    $VERSION     = 0.01;
    
    @ISA         = qw(Exporter);
    @EXPORT      = qw();
    %EXPORT_TAGS = ( );     # eg: TAG => [ qw!name1 name2! ],
    
    # your exported package globals go here,
    # as well as any optionally exported functions
    @EXPORT_OK   = qw();
}
use vars      @EXPORT_OK;

# non-exported package globals go here
use vars      qw($load_count);

# initialize package globals, first exported ones

# then the others (which are still accessible as $Some::Module::stuff)
my $load_count=0;

# all file-scoped lexicals must be created before
# the functions below that use them.

# file-private lexicals go here
my $uname = "http://www.w3.org/2001/12/uname#uname";
my $rdfns="http://www.w3.org/1999/02/22-rdf-syntax-ns#";
my $damlns="http://www.daml.org/2001/03/daml+oil#";
my $pwd=`pwd`;
chomp $pwd;

sub new {
    my $this = shift;
    my @args = (@_);
    my $class = ref($this) || $this;
    my $self = {};
    bless $self, $class;
    $self->initialize(@args);
    return $self;
}

sub initialize {
    my $this = shift;
    my $arg = shift;

    $this->{top_level_objects} = { };

    if (ref($arg) eq "") {
	$this->init_from_string($arg);
    } elsif (ref($arg) eq "HASH") {
	$this->init_from_hash($arg);
    } else {
	croak("RDF::Pool->new() parameter type unknown: ".ref($arg));
    }

    $this->add_prefix("_rdf_", "http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdf");
    $this->add_prefix("_daml_", "http://www.daml.org/2001/03/daml+oil#", "daml");
}

sub init_from_hash {
    my $this = shift;
    my $h = shift;

    while (my ($key, $value) =  each %$h) {
    }
}

sub init_from_string {
    my $this = shift;
    my $s = shift;

    $s = "#" unless defined $s;

    # this should be last in the table for other perl prefixes to work
    $this->add_prefix("", $s, "");
}

sub add_prefix {
    my $this = shift;
    my $perl_prefix = shift;
    my $rdf_prefix = shift;
    my $xml_prefix = shift;

    unshift @{$this->{prefix_table}}, { 
	perl_prefix=>$perl_prefix,
	xml_prefix=>$xml_prefix,
	rdf_prefix=>$rdf_prefix,
    }
}

sub add {
    my $this = shift;
    my $added = shift;

    $this->{top_level_objects}->{$added} = $added;
}

sub top_level_objects {
    my $this = shift;
    return values %{$this->{top_level_objects}};
}

sub ntriples_unescape {
    my $c = shift;

    # from http://www.w3.org/2001/sw/RDFCore/ntriples/#sec-string1
    return "\\", if $c eq "\\";
    return "\"", if $c eq "\"";
    return "\n", if $c eq "n";
    return "\r", if $c eq "r";
    return "\t", if $c eq "t";

    croak("bad backslash-escaped character '$c'");
}

sub get_object {
    my $this = shift;

    my $name = shift;
    my $bnodes = shift;

    # "literal"
    if ($name =~ m/^"(.*)"$/) {
	my $result = $1;
	$result=~ s/\\(.)/ntriples_unescape($1)/ge;
	return $result;
    }

    # _:bNode?
    if (defined $bnodes && $name =~ m/^_/) {
	if (exists $bnodes->{$name}) { return $bnodes->{$name}; }
	my $result = { };
	$result->{_raw}->{_cooked} = $result;
	bless $result->{_raw}, "RDF::Object";
	$bnodes->{$name} = $result;
	return $result;
    }

    # <rdfid>
    if ($name =~ m/^<(.*)>$/) {
	$name = $1;
    }

    if (exists $this->{rdfid}->{$name}) {
	return $this->{rdfid}->{$name}
    } else {
	my $result = { _rdf_identifier=>$name };
	$result->{_raw}->{_cooked} = $result;
	$result->{_raw}->{"<$uname>"}->{"<$name>"} = 1;   # BUG    1?
	bless $result->{_raw}, "RDF::Object";
	$this->{rdfid}->{$name} = $result;
	return $result;  
    }
}

sub split_rdf_name {
    my $in = shift;

    # make the second part as long as possible while still a Qname
    if ($in =~ m/(.*?)([a-zA-Z_][-a-zA-Z_0-9.]*)$/) {
	if ($1 eq "") { die("null namespace guessed from '$in'"); }
	return ($1, $2);
    }
    croak("Can't rdf/xml encode \"$in\" as a predicate\n.");
}

sub perl_name_from_rdf_name {
    my $this = shift;
    my $rdf_name = shift;

    foreach my $e (@{$this->{prefix_table}}) {
	my $rdf_prefix = $e->{rdf_prefix};
	my $len = length($rdf_prefix);
	#print STDERR "Is substr(\"$rdf_name\",0,$len) eq \"$rdf_prefix\"?\n";
	if (substr($rdf_name, 0, $len) eq $rdf_prefix) {
	    return $e->{perl_prefix} . substr($rdf_name, $len);
	}
    }
    
    # we could return undef, if we didn't want this stuff at all

    return "<" . $rdf_name . ">";
}

sub rdf_name_from_perl_name {
    my $this = shift;
    my $perl_name = shift;
    
    if ($perl_name =~ m/^<(.*)>$/) {
	return wantarray ? split_rdf_name($1) : $1;
    }

    foreach my $e (@{$this->{prefix_table}}) {
	my $perl_prefix = $e->{perl_prefix};
	my $len = length($perl_prefix);
	if (substr($perl_name, 0, $len) eq $perl_prefix) {
	    return wantarray ?
		   ($e->{rdf_prefix} , substr($perl_name, $len), $e->{xml_prefix})
            	 : ($e->{rdf_prefix} . substr($perl_name, $len))
	}
    }
    die();
}

sub load_file {
    my $this = shift;
    my $filename = shift;
    my $baseURI = shift || "file://$pwd/$filename";

    $this->{lastLoadBaseURI} = $baseURI;

    if ( ($baseURI || $filename) =~ m/.nt$/ ) {
	return load_ntriples($this, $filename, "file:".$filename, $baseURI);
    }

    if ( ($baseURI || $filename) =~ m/.n3$/ ) {
	return load_n3_file($this, $filename, $baseURI);
    }

    return load_rdfxml_file($this, $filename, $baseURI);
}

sub load_rdfxml_file {
    my $this = shift;
    my $filename = shift;
    my $baseURI = shift;

    my $pseudoBase = "file:$filename";

    #print "pseudoBase $pseudoBase realBase $baseURI\n";
    return $this->load_ntriples("rdfdump --scan --output=ntriples --quiet $pseudoBase |", $pseudoBase, $baseURI, "^rdfdump: ");
}

sub load_n3_file {
    my $this = shift;
    my $filename = shift;
    my $baseURI = shift;

    my $pseudoBase = "file:$filename";

    return $this->load_ntriples("cwm $pseudoBase --ntriples | tee /tmp/cwm.out |", $pseudoBase, $baseURI, "^\\S*\$");
}

sub cleanPrefix {
    my $in = shift;
    my $replaceThisUriPrefix = shift; 
    my $withThisUriPrefix = shift;
    my $out = $in;

    # print "Cleaning '$in' with '$replaceThisUriPrefix' -> '$withThisUriPrefix'\n";

    # this is just a raptor bug workaround
    if ($out =~ m/^\<genid(\d+)\>$/) {
	$out = "_:genid$1";
    }
    # this is just a raptor bug workaround
    if ($out =~ m/^<([^:]+)>$/) {
	$out="<$withThisUriPrefix#$1>";
    } else {
	$out =~ s/^<$replaceThisUriPrefix/<$withThisUriPrefix/;
    }
    # print "==> '$out'\n";
    return $out;
}

sub load_ntriples {
    my $this = shift;
    my $filename = shift;
    my $replaceThisUriPrefix = shift; 
    my $withThisUriPrefix = shift;
    my $ignoreable_line_pattern = shift;

    my $bnodes = {};
    my @result;
    my %in_result;

    open(F, $filename) or 
	die("can't read \"$filename\": $!");
    while (<F>) {
	next if $ignoreable_line_pattern && $_ =~ $ignoreable_line_pattern;
	s/^\s*//;
	next if m/^$/;
	next if m/^\s*\#/;
	my ($subj, $pred, $obj) = split(/\s+/, $_, 3);
	unless (defined $obj) {
	    print "Bad NTriples Line: $_\n";
	    next;
	}
	$obj =~ s/\s*\.\s*$//;

	# print "'$subj', '$pred', '$obj' ==> ";

	if (defined $withThisUriPrefix) {
	    $subj = cleanPrefix($subj, $replaceThisUriPrefix, $withThisUriPrefix);
	    $pred = cleanPrefix($pred, $replaceThisUriPrefix, $withThisUriPrefix);
	    $obj  = cleanPrefix($obj, $replaceThisUriPrefix, $withThisUriPrefix);
	}
	# print "TRIPLE: '$subj', '$pred', '$obj'\n";
	if ($this->{triple_hook}) { 
	    if ($this->{triple_hook}->($subj, $pred, $obj)) {
		goto after_add;
	    }
	} 

	my $s = $this->get_object($subj, $bnodes);
	my $p = $this->get_object($pred, $bnodes);
	my $o = $this->get_object($obj,  $bnodes);

	# raw
	if (ref($o)) {
	    $s->{_raw}->{$pred}->{$obj} = $o->{_raw};
	} else {
	    $s->{_raw}->{$pred}->{$obj} = $o;
	}
	$this->{subjects}->{$s}=$s;
	
	# cooked
	my $pname = $p->{_rdf_identifier};
	if ($pname eq "http://www.w3.org/1999/02/22-rdf-syntax-ns#type") {
	    # BUG: assume there's only one rdf:type for the object
	    bless $s, $this->perl_name_from_rdf_name($o->{_rdf_identifier});
	} else {
	    $pname = $this->perl_name_from_rdf_name($pname);
	    if (exists $s->{$pname}) {
		if ( ! exists $s->{"@".$pname}) {
		    push @{$s->{"@".$pname}}, $s->{$pname};
		} 
		push @{$s->{"@".$pname}}, $o;
	    } else {
		$s->{$pname} = $o;
		# print STDERR "spo: $s $pname $o\n";
	    }
	}
	
	if (! $in_result{$s}) {
	    push @result, $s;
	    $in_result{$s} = 1;
	    $this->add($s);
	}

      after_add:
    }
    close(F);
    return @result;
}

sub load {
    my $this = shift;
    my $source = shift;

    $source =~ s/\#.*//;

    if ( ! ($source =~ /:/) ) { return $this->load_file($source); }

    # pretty kludgy

    my $tmp = "/tmp/rdf_pool_" . $$ . "_" . ($load_count++);
    my $auth = "";

    # hack since we can't do content negotiation properly!
    foreach my $suffix (".rdf", ".xml", "") {
	my $got_it;
	open(P, "wget $auth -O $tmp '$source$suffix' 2>&1 |") or die("running wget");
	while(<P>) {
	    if (/^HTTP request sent, awaiting response... 200/) { $got_it = 1; }
	    if (/^HTTP request sent, awaiting response... /) { 
		print STDERR "wget $source$suffix\n==> $_\n";
	    }
	}
	close(P);
	if ($got_it) { goto GOT_IT; }
    }
    croak("Can't obtain content from $source");

  GOT_IT:
    print STDERR "got data, $tmp\n";
    $this->load_file($tmp, $source);
    # unlink $tmp;
}

sub xquote {
   my $in = shift @_;
   $in =~ s/&/&amp;/g;
   $in =~ s/</&lt;/g;
   $in =~ s/>/&gt;/g;
   return $in;
}

sub xml_name_from_perl_name {
    my $this = shift;
    my $perl_name = shift;
    my $nameSpaces = shift;     # updated if a new namespace is used
    my $revNameSpaces = shift;  # updated if a new namespace is used

    my ($namespace, $localpart, $xml_prefix) = 
	$this->rdf_name_from_perl_name($perl_name);

    # print STDERR "PerlName='$perl_name' ==> rdfpre='$namespace'  localpart='$localpart' xmlpre='$xml_prefix'\n";

    if (exists $nameSpaces->{$namespace}) {
	$xml_prefix = $nameSpaces->{$namespace};
	if ($xml_prefix) {
	    return $xml_prefix . ":" . $localpart;
	} else {
	    return $localpart;
	}
    }

    if (! defined $xml_prefix) {
	# guess some prefix
	if ($namespace =~ m/^(.*?)([a-zA-Z_][-a-zA-Z_0-9.]*).?$/) {
	    $xml_prefix = $2;
	} else {
	    $xml_prefix = "ns";
	}
    }
    
    my $count = "";
    while (exists $revNameSpaces->{$xml_prefix . $count}) {
	$count++;
    }
    $xml_prefix = $xml_prefix . $count;
    $revNameSpaces->{$xml_prefix} = $namespace;
    $nameSpaces->{$namespace} = $xml_prefix;
    if ($xml_prefix) {
	return $xml_prefix . ":" . $localpart;
    } else {
	return $localpart;
    }
}

sub rdfxml_description {
    my $this = shift;
    my $obj = shift;
    my $ns = shift;
    my $rns = shift;
    my $done_map = shift;
    my $indent = shift || 0;
    my $result = "";

    my $class = ref $obj;
    print STDERR "$indent rdfxml_desc entered with $obj\n";

    if (! $class) {
	return $obj;
    }

    if ($done_map->{$obj}) { print STDERR "already been done!  don't call me!"; }

    my $type = $class;
    if ($class eq "HASH") {
	my $explicit_type = $obj->{_rdf_type};
	if ($explicit_type) {
	    if ($type && $explicit_type ne $type) {
		croak("object with contradictory types");
	    }
	    $type = $explicit_type;
	} else {
	    $type = undef;
	}
    } elsif ($class eq "ARRAY") {
	croak("arrays not handled at top level (@$obj)");
    } else {
	if (!$type) {
	    croak("can't serialize $class objects without more information.");
	}
    }

    my $id = $obj->{_rdf_identifier};
    unless ($id) { $id = "#obj" . $done_map->{__COUNT}++; }
    $done_map->{$obj} = $id; 

    my $prefix = " " x $indent;

    my $tag;
    if ($type) {
	$tag = $this->xml_name_from_perl_name($type, $ns, $rns);
    } else {
	$tag = "rdf:Description";
    }
    $result .= "$prefix<$tag rdf:about=\"" . xquote($id) . "\">\n";

    while (my ($key,$val) = each %$obj) {
	next if ($key eq "_rdf_type");
	next if ($key eq "_rdf_identifier");

	#print STDERR "key: $key\n";
	my $multivalued = 1;
	if ($key =~ m/^@(.*)/) {
	    $key = $1;
	    $multivalued = 1;
	}
	my $attr = $this->xml_name_from_perl_name($key, $ns, $rns);

	if ($done_map->{$val}) {
	    $result .= "$prefix  <$attr rdf:resource=\"" . $done_map->{$val} . "\" />\n";
	} else {
	    if (ref $val eq "ARRAY") {
		if ($multivalued) {
		    for my $v (@$val) {
			if (ref $v eq "ARRAY") { croak("can't handle nested arrays"); }
			if (ref $v) {
			    $result .= "$prefix  <$attr>\n";
			    $result .= $this->rdfxml_description($v, $ns, $rns, 
								 $done_map, $indent+4);
			    $result .= "$prefix  </$attr>\n";
			} else {
			    $result .= "$prefix  <$attr>" . xquote($v) . "</$attr>\n";
			}
		    }
		} else {
		    # encode it as an RDF list <li>
		    die("not implemented");
		}
	    } elsif (ref $val) {
		$result .= "$prefix  <$attr>\n";
		$result .= $this->rdfxml_description($val, $ns, $rns, 
						     $done_map, $indent+4);
		$result .= "$prefix  </$attr>\n";
	    } else {
		$result .= "$prefix  <$attr>" . xquote($val) . "</$attr>\n";
	    }
	}
    }
    
    $result .= "$prefix</$tag>\n";

    return $result;
}


sub as_rdfxml {
    my $this = shift;

    my $ns = { "http://www.w3.org/1999/02/22-rdf-syntax-ns#" => "rdf" };
    my $rns = { "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#" }; 
    my $done_map = { };

    my $contents = "";
    
    foreach my $obj ($this->top_level_objects()) {
	$contents .= $this->rdfxml_description($obj, $ns, $rns, $done_map, 2)
	    unless $done_map->{$obj};
    }

    # print Dumper($ns);

    my $result = "<rdf:RDF ";
    my $def;
    while (my ($key,$val) = each %$ns) {    
	if ($val eq "") { $def = $key; next; }
	$result .= "xmlns:$val=\"$key\"\n         ";        
    }
    if ($def) {
	$result .= "xmlns=\"" . $def . "\"";
    }
    $result .= ">\n$contents</rdf:RDF>\n";
    return $result;
}

sub rdf_prefix_for_xml_prefix {
    my $this = shift;
    my $prefix = shift;

    foreach my $e (@{$this->{prefix_table}}) {
	return $e->{rdf_prefix}, if $e->{xml_prefix} && $e->{xml_prefix} eq $prefix;
    }
    return undef;
}

sub save_rdfxml_file {
    my $this = shift;
    my $filename = shift;
    my %options = (@_);

    use Fcntl ':flock'; # import LOCK_* constants

    if ($options{append}) {
	open(OUT, ">>", $filename) || croak("open $filename: $!");
    } else {
	open(OUT, ">", $filename) || croak("open $filename: $!");
    }

    if ($options{lock}) {
	flock OUT, LOCK_EX  || croak("flock $filename: $!");
	if ($options{append}) {
	    seek(OUT, 0, 2);  # reseek to the end, now that we have it locked
	}
    }

    print OUT $this->as_rdfxml();

    if ($options{lock}) {
	flock OUT, LOCK_UN;
    }
    close(OUT);
}

sub forEachTriple {
    my $this = shift;
    my $func = shift;

    # total hack implementation for now!

    my $tmp = "/tmp/pool.$$.rdf";
    print "Using tmp file for text: $tmp\n";
    print STDERR __LINE__ . "\n";;
    $this->save_rdfxml_file($tmp);
    print STDERR __LINE__ . "\n";;
    my $that = RDF::Pool->new();
    print STDERR __LINE__ . "\n";;
    $that->{triple_hook} = $func;
    print STDERR __LINE__ . "\n";;
    $that->load($tmp);
    print STDERR __LINE__ . "\n";;
    $that = undef;

    unlink $tmp;
}

sub elements {
    my $obj = shift;

    # should compare to _daml_nil, but rdfdump isn't giving us those!
    # so this is the closing version.....

    unless ($obj->isa('_daml_List')) {
	print Dumper($obj);
	die;
    }

    my @rest = ();
    if ($obj->{_daml_rest}) {
	@rest = elements($obj->{_daml_rest});
    }
    return ( $obj->{_daml_first}, @rest );
}

sub raw_elements {
    my $obj = shift;

    # should compare to _daml_nil, but rdfdump isn't giving us those!
    # so this is the closing version.....

    unless ($obj->hasType("${damlns}List")) {
	print Dumper($obj);
	die;
    }

    my @rest = ();
    if ($obj->{"<${damlns}rest>"}) {
	# print "looking for rest of " . keys(%{$obj->{"<$uname>"}}) . "\n";
	@rest = raw_elements($obj->getOnly("${damlns}rest"));
    }
    return ( $obj->getOnly("${damlns}first"), @rest );
}



package RDF::Object;

sub hasType {
    my $this = shift;
    my $type = shift;

    return ($this->{"<${rdfns}type>"}->{"<$type>"} || 0);
}

sub getOnly {
    my $this = shift;
    my $property = shift;

    my $values = $this->{"<$property>"};

    if ($values) {
	my @allValues = values(%$values);
	if (@allValues == 1) {
	    return $allValues[0];
	} else {
	    # too many
	    die "Property $property has " . scalar(@allValues) . " values, not one.";
	}
    } else {
	# too few
	die "Property $property has no values, not one.";
    }
}

sub getMaybeOne {
    my $this = shift;
    my $property = shift;

    my $values = $this->{"<$property>"};

    if ($values) {
	my @allValues = values(%$values);
	if (@allValues == 1) {
	    # print "RETURNING: $allValues[0]\n";
	    return $allValues[0];
	} else {
	    # too many
	    die "Property $property has " . scalar(@allValues) . " values, not one.";
	}
    } else {
	return undef;
    }
}

    


1;
