#!/usr/bin/perl use strict; # # Turn debug mode on/off # my $verbose = 0; #my $verbose = 1; use CGI; use CGI::Carp 'fatalsToBrowser'; use Win32::ODBC; use Data::Dumper; $Data::Dumper::Indent = 1; use constant AUTONOMYTAG => 'originator=autonomy'; # # This program is called one of 3 ways, indicated by the URL parameter. # It's one of: # URL=/doifinder/... # URL=/uidfinder/... # URL=/linkfinder/... # use constant DOICALL => 'doi'; use constant UIDCALL => 'uid'; use constant LINKCALL => 'link'; # # JL - 033005 - took out comment previously for npg.natureny.com as staging # and now commented foxtrot.nature.com as staging # use constant NPGSTAGING => 'http://npg.natureny.com'; # use constant NPGSTAGING => 'http://foxtrot.nature.com'; use constant NPGLIVE => 'http://www.nature.com'; use constant PALGRAVESTAGING => 'http://staging.palgrave.nature.com'; use constant PALGRAVELIVE => 'http://www.palgrave-journals.com'; # # Needed this special BEGIN block because we need to # fiddle library paths between the live and staging servers # my $fileServerRoot; #my $jInfo; #my $artRec; #my $journal; #my $isFramesSite; #my $isNatureArchive; my $isLiveServer; BEGIN { # # New way depends on a newly created environment variable LOCATION. # Also, defaults to live server if in doubt. # my $host = $ENV{'COMPUTERNAME'}; if ($host && $host eq 'SNAPPLE') { $fileServerRoot = "\\\\snapple\\web_d\\web"; $isLiveServer = 0; } else { $fileServerRoot = "\\\\web-server-1\\web_d\\web"; $isLiveServer = 1; } push(@INC, "$fileServerRoot\\npg\\lib\\perl"); } # # New template tool kit. # use Template; # # Nature, Specialist Journals, and Palgrave Journal information # use JournalInfo; # # Needed for Google mods. # use GoogleAddress; #use HTTP::Date; use File::stat; #my $ifModSince; # seems like I've seen this one before... my $monthmap = { "01" => "January", "02" => "February", "03" => "March", "04" => "April", "05" => "May", "06" => "June", "07" => "July", "08" => "August", "09" => "September", "10" => "October", "11" => "November", "12" => "December", }; # staging or live? my($server) = $ENV{'SERVER_NAME'}; my $DYNAPAGEURI = '/cgi-taf/DynaPage.taf?file='; #my $DYNAPAGEURL = "http://$server" . $DYNAPAGEURI; #Added for Google compliance my $metaS_URL = "DynaSearch\\MetaS"; my $natureLink = qq(Nature.com); # we need this to get input key value pairs my $fd = new CGI; my $debug = $fd->param('debug'); $verbose = 1 if $debug == 42; # only the dolphins know this secret if($verbose) { my $hostName = `hostname`; print <doifinder debug

hostname: $hostName
server: $server
file server root: $fileServerRoot
EOF

    print "QUERY_STRING: $ENV{QUERY_STRING}\n";
	#print all arguments and values
	my @all = $fd->param;
	foreach my $a (@all) {
		my @v = $fd->param($a);
		print "$a:\n";
		foreach my $v (@v) {
			print "  $v\n";
		}
	}
}

#
# DSN - this one does the right thing on either server
#
my $DSN = "DSN=DynaSearch;UID=tango_client;PWD=dancetango";

my $dbh;	# database handle

#
# This program is called like this:
# http://www.nature.com/cgi-bin/doifinder.pl?URL=/doifinder/10.1038/123435
# or
# http://www.nature.com/cgi-bin/doifinder.pl?URL=/uidfinder/10.1038/123435
# or
# http://www.nature.com/cgi-bin/doifinder.pl?URL=/linkfinder/10.1038/123435
#
# This is accomplished by an IIS redirect from the original URL.
#

# get the URL parameter and parse
my($URL) = $fd->param('URL');

my($whichCall, $doi) = $URL =~ m,/(doi|uid|link)finder/(.*?)$,i;

# bail if there's no DOI
unless ($doi) {
    my $redirect = "http://$server/crossref/crossref_error_handle.html";
    doRedirect($redirect, "no doi to lookup");
    exit 0;
}

# patch in handle on old uidfinder artid 
$doi = uidFinderFixDOI($doi);

# old sj/palgrave period/slash patch
$doi = sjPalgraveFixDOI($doi);

print "DOI: $doi WHICH: $whichCall\n" if $verbose;

#
# From here on, we need database records.
#
my $articles = lookupArticleByDOI($doi);
my $articleCount = scalar(@$articles);

#
# if there's no database record(s), not much we can do.
#
if ($articleCount == 0) {
    my $redirect = "http://$server/crossref/crossref_error_handle.html";
    doRedirect($redirect, "No database record");
    exit 0;
}

#
# Special handling for google bot.
#
if (isGoogle()) {
    #
    # Note, if there are multiple renditions, we just give
    # the first one to google. They should all have the same content,
    # more or less. This is not really defined.
    #
    buildGoogleHTML($$articles[0]);
    exit 0;
}

#
# For news@nature and Nature Clinical Practice (and more in the future),
# we have the possibility of multiple versions of an article on-line.
# This means we need a renditions page.
#
if ($articleCount > 1) {
    buildRenditionsPage($articles, $whichCall);
    exit 0;
}

#
# We have a single article.
# Redirect to content or build citation page
#
my $aRec = $$articles[0];
unless (handleOutput($aRec)) {
    my $redirect = "http://$server/crossref/crossref_error_handle.html";
    doRedirect($redirect, "error in handleOutput");
    exit 0;
}

$dbh->Close if $dbh;

if ($verbose) {
	print <


EOF
}

exit 0;

#
# Replace slashes (/) after sj|palgrave with periods (.)
# 10\.1038/sj[./](.*?)[./](.*)$
#
sub sjPalgraveFixDOI {
	my($olddoi) = @_;

    return $olddoi unless $doi =~ /\/(sj|palgrave)/i;

	print "Fix DOI (old) $olddoi " if $verbose;
	my($front, $back) = $olddoi =~ m/(.*?\/)((sj|palgrave).*?)$/;
	$back =~ s/\//\./g;
	my $newdoi = "$front$back";	#rebuild
	print "(new) $newdoi\n" if $verbose;

	return $newdoi;
}

#
# Patch up old uidfinder URLs which used just the article ID
# and not the full DOI.
#
sub uidFinderFixDOI {
    my($doi) = @_;

    return $doi unless $whichCall eq UIDCALL;

    return $doi unless $doi !~ m,/,;

    print "patching in handle for $doi\n" if $verbose;
    $doi = '10.1038/' . $doi;
}

#
# Use the template tool kit and info from the database to build
# a citation page.
#
sub buildCitation {
	my($artRec) = @_;

    print "build citation:\n" if $verbose;

	print "Content-Type: text/html\n\n" unless $verbose;

	#
	# The template
	#
	my $shell = 'pageshell_citation';

	#
	# Search path for templates. This allows for overrides on
	# a per-journal basis (not yet used).
	#
	my $pathList = [
		"$fileServerRoot\\npg\\lib\\parts\\$artRec->{ProdID}",
		"$fileServerRoot\\npg\\lib\\parts",
	];

    #
    # untested!
    #
    if (my $newTemplateDir = $artRec->{JINFO}->getCitationTemplateDir()) {
      unshift(@$pathList, "$fileServerRoot\\npg\\lib\\parts\\$newTemplateDir");
    }

	print Dumper($pathList) if $verbose;

	#
	# Calculate some other stuff we need.
	#
	my $journalTitle = JournalInfo->titleFromProdID($artRec->{ProdID});
	my ($year) = convert_date($artRec->{PD_date});

	my ($spn, $epn); 
	if ($artRec->{Spn} eq $artRec->{Epn}){
		$spn = $artRec->{Spn};
		$epn = undef;
	}
	else{
		$spn = $artRec->{Spn};
		$epn = $artRec->{Epn};
	}
	
	#
	# Variables that the template toolkit will need to build the page.
	#
	my $vars = {
		prodid => $artRec->{ProdID},
		atl => $artRec->{Atl},
		vol => $artRec->{Vol},
		issue => $artRec->{Iss},
		authors => buildAuthorDisplay($artRec->{ArtID}, $artRec->{ProdID}, 0),
		spn => $spn,
		epn => $epn,
		journal => $journalTitle,
		#date => $date,
		year => $year,
		fullfile => $artRec->{FULLLINK},
	};

    #
    # Special rules for Nature archive when there's not a 
    # useable abstract. We present the PDF link
    # along with some special text. See buildNPGDynapageLinks() for more.
    #
    if ($artRec->{ISNATUREARCHIVE}) {
        delete $vars->{fullfile};
        $vars->{naturearchive} = 1;
        $vars->{pdfurl} = $artRec->{PDFLINK} if $artRec->{PDFLINK};
    }

	#
	# Get a template object.
	#
	my $template = Template->new({
		INCLUDE_PATH => $pathList,
	});

	#
	# Process the template, writing the output to STDOUT.
	#
	if ($verbose) {
		print "\ntemplate call: ", Dumper($vars);
	}
	else {
		unless ($template->process($shell, $vars)) {
			print "ERROR: ", $template->error(), "\n";
		}
	}
}
	
#
# New field is date/time, format, directly from the database:
#
# 'PD_date' => '2000-09-01 00:00:00.000',
#
# All we need is the year.
#
sub convert_date {
	my($date) = @_;
	my($year) = $date =~ /^(\d\d\d\d)/;
	return $year;
}

sub openDB {
	my $con;

	if (!($con = new Win32::ODBC($DSN))){
		print "Error connecting to DSN DynaSearch\n";
		print "Error: " . Win32::ODBC::Error() . "\n";
		exit;
	}
	return $con;
}

#
# Newest version allows for multiple records for a DOI.
#
sub lookupArticleByDOI {
	my($doi) = @_;

	print "Lookup by DOI\n" if $verbose;
	#
	# From here on, we need the database, so open a connection.
	#
	$dbh = openDB() unless $dbh;

	#
	# look up the article
	#
	my($SqlStatement) = <Sql($SqlStatement)){
		print "ERROR: " . $dbh->Error() . "\n";
		return undef;
	}

    my $renditions = [];

    my $count = 0;
	while ($dbh->FetchRow()) {
		my %Data = $dbh->DataHash();
        #
        # Nervous fix. Because of changes to DynaPage, upper case
        # letters on live are not working. Patch here, and hope that
        # there are no true upper case letters in any HTML file names.
        #
        $Data{SystemID} = lc($Data{SystemID});

        if($verbose){
            print "artRec $count: " . Dumper(\%Data);
        }
        #
        # Go ahead and assign a journal info object here, since
        # we use it more and more. Most often, multiple versions for
        # a DOI will be from different journals.
        #
        my $prodID = $Data{'ProdID'};
        my $ji = new JournalInfo($prodID);
        die "no journal info for $prodID\n" unless $ji;
        $Data{JINFO} = $ji;
        push(@$renditions, \%Data);
        $count++;
	}

    #
    # Let's figure out the links for all records here, for more
    # flex later.
    #
    foreach my $art (@$renditions) {
        if ($art->{JINFO}->isFoxtrotTitle()) {
            buildFoxtrotLinks($art);
        }
        elsif ($art->{JINFO}->isNPG) {
            buildNPGDynapageLinks($art);
        }
        else {
            buildSJDynapageLinks($art);
        }
    }

	return $renditions;
}

sub doRedirect {
	my($redirect, $reason) = @_;

	if ($verbose) {
		print "REDIRECT: $reason:\n  to $redirect\n";
	}
	else {
		print $fd->redirect("$redirect");
	}
}

#
# Routine used to determine if call is from a google search address or not.
#	Google calls get special handling.
#
# Discovering Google IP addresses has gotten more complicated
# because of some proxying going on. Instead of just checking the
# value of HTTP_REMOTE_ADDR, we need to make a list of addresses
# to check. If any one of them is a Google address, we do the special stuff.
#
sub isGoogle {

    #return 1; # for testing

	my @addressList;

	#load list with possible address(es)
	if ($ENV{HTTP_X_FORWARDED_FOR}) {
		my $addresses = $ENV{HTTP_X_FORWARDED_FOR};
		if ($addresses =~ ',') {
			@addressList = split /,[ ]?/, $addresses;
		} else {
			push(@addressList, $addresses);
		}
	}

	#tack on REMOTE_ADDR
	push(@addressList, $ENV{REMOTE_ADDR});

    print "Checking Addresses: " . Dumper(\@addressList) if $verbose;

	#loop through addresses, see if we get a hit
	foreach my $a (@addressList) {
		if(GoogleAddress::isGoogleAddress($a)) {
			print "isGoogle\n" if $verbose;
			return 1;
		}
	}

	return 0;
}

#
# Special handling routine for calls from google search engine.
#	Return html rather than link(s)
#
sub buildGoogleHTML {
	my($artRec) = @_;

    print "buildGoogleHTML:\n" if $verbose;

    my $journal = lc($artRec->{ProdID});
    my $publisherRoot;
	my $page;
	my $doReferences=0;

	my $ifModSince = $ENV{'HTTP_IF_MODIFIED_SINCE'};

    print "ModTime: [$ifModSince]\n" if $verbose;

	my $googleModTime;
    if ($ifModSince) {
		$googleModTime = &str2time($ifModSince);
    }

    if ($verbose) {
        print "googleModTime: $googleModTime\n" if $googleModTime;
    }

	#See if there are references
	$doReferences = $artRec->{JINFO}->doAJRefParse;

    if ($artRec->{JINFO}->isPAL) {
        print "is a palgrave title\n" if $verbose;
        $publisherRoot = "palgrave";
    }
    else {
        #See if there are references
        $doReferences = $artRec->{JINFO}->isNPG;
        $publisherRoot = "npg";
    }

	my $sysID = $artRec->{SystemID};
	my $vol = $artRec->{Vol};
	my $iss = $artRec->{Iss};

	my $refPath="";
	my $isModified=0;
	#Build reference location path
	if ($doReferences) {
        $refPath =
            "$fileServerRoot\\$publisherRoot\\$journal\\journal\\v$vol\\n$iss\\refs\\$sysID.fhtml";

		#Check timestamp of reference file...
		if( -e $refPath) {
            print "refpath: $refPath exists\n" if $verbose;
			if ($googleModTime && (stat($refPath)->mtime > $googleModTime)) {
				++ $isModified;
			}
		}
	}

	#Build the spider food path (all spider food under npg/)
    my $path = "$fileServerRoot\\npg\\$metaS_URL\\$journal\\journal\\v$vol\\n$iss\\$sysID.htm";

	#Check timestamp of spider food ...
	if ( -e $path) {
        print "spider food: $path exists\n" if $verbose;
		if ($googleModTime && (stat($path)->mtime > $googleModTime)) {
			++ $isModified;
		}
	}

    #
	# If neither spider food nor references are more recent
    # than google time, no more work necessary.
    #
	if ($googleModTime && $isModified == 0) {
		print "Stop HTTP304\n" if $verbose;
		#print "HTTP/1.0 304 Not Modified\n";
		#print "Date: $ifModSince\n\n";
		doRedirect('/cgi-bin/nph-not-modified.pl', "redirect to not modified");
		exit 0;
	}

	#open & load spider food
	my $contents;
    if ( -e $path ) {
        local $/ = undef;
        open( FILE, $path ) && ( $contents =  ) && close(FILE)
              || print "Can not load file:\t" . $path . "\t$!\n";
    }
    else {
        print "Can not locate file:\t" . $path . "\n";
	}

	$page = $contents;

	#
	# Strip off existing bib, if any
	# Prepend references, if any, and nature url before close body tag
	#
    $page =~ s@(