#!/usr/local/bin/perl
# sfsubsearch.pl
#
# form variable names & values 
#  liblist: genbank  gbnew   pir  swissprot swissnew
#  query1 : words
#  class1 : AllText, ID, Accession, Date, Definition, Keywords, 
#           Organism, Authors, Title, Reference, Comment, Features, SeqLength
#  or1    : or  and  butnot
#  query2 : words
#  class2 : like class 1
#  searchapp: fasta  tfasta
#  ktup   : number
#  seqtype: nucleic  protein
#  inputseq: text block
#  seqname: words
#

###  Server specific vars
$maxload = '2';   # 2, whatever, system load level
$maxlib = '100000'; # max subset library entries (for test service)
$tmpdir= "/tmp/";  # this temp must lie in Gopher kingdom (below $gopath)

$SRSROOT	= "/b4/srs";
#$SRSROOT	= "/b6/tmp/srst";
$subpath	= "$SRSROOT/tmp/";
$subsetlib	= "srslib$$";
$logfile	= "/usr/tmp/srsfasta.log";
$sysloadaverage = "/usr/ucb/uptime";

#$seqsearchpath  = 'nice /b4/work/fasta/bin/';
$seqsearchpath  = 'nice /b4/srsfasta/bin/';

$SRSserver  = "http://iubio.bio.indiana.edu:81/srs/srsc";
$srsIdLinkFmt = "<a href=" . $SRSserver . "?[%s-id:%s]>%s</a>";
$libname= "genbank";  # genbank  gbnew   pir  swissprot swissnew

###  Data-specific vars

$title  = "SRS-FastA Sequence Subset Search";
#$title  = "GenBank Subset Search";
#$title = "Sequence Databank Subset Similarity Search";


###  static vars
$p = "'";
$q = '"';
$dohtml= 1;
$goplus = 0;
$isdialog= 1;
$or1='or'; 
$srsq= "";

### derived vars

$seqfile	= $tmpdir . "gbsub$$.seq";
$subsetoutput	=  $tmpdir . "gbsub$$.list";

### external subroutines

require "getopts.pl";
require "ctime.pl";


##
## MAIN
##

### read command-line
&Getopts('dhv:');

if ($opt_d) { $isdialog = 1; }
if ($ENV{'CONTENT_TYPE'} ne "") { $opt_v= $ENV{'CONTENT_TYPE'};}
if ($ENV{'CONTENT_LENGTH'} ne "") { $conlen= $ENV{'CONTENT_LENGTH'};}
if ($opt_v eq "application/gopher+-menu") { $goplus= 1; }
if ($opt_v eq "text/html") { $dohtml= 1;  }
if ($0 =~ "html" || $opt_h) { $dohtml = 3;  }  

if ($dohtml) {  
  print "Content-type: text/html\n\n<body><pre>\n";
  #print "Content-type: text/plain\n\n";

  if ($isdialog) {
    ## all parameters are on commandline w/ switch names
    ## !! POST method puts them in STDIN
    if ($#ARGV>-1) { $inputs= join(' ',@ARGV); }
    else { $inputs= ""; }

    if ($conlen) {
       # ! for n-httpd
       #print "reading $conlen bytes\n";
	     read (STDIN, $inputs, $conlen);
       @parms= split(/[ &]/, $inputs);
	    }
    else {
      while (<STDIN>) { 
       chop($_); 
       if ($_ ne "") { $inputs= join(' ',$inputs,$_); }
       }
     @parms= split(/[ ]/, $inputs);
     }
    #print "<p>params= ",join(' ',@parms),"<p>\n";

    foreach  (@parms) {
	    if (/=/) { 
	      ($name,$data)= split(/=/);
	      if ($data ne "") { 
					$data= &unescapeUrl( $data);
          if ($flds{$name} ne "") { $s= " "; } else { $s = ""; }
	        $flds{$name} .=  $s . $data;
	        }
	       }
	    ##elsif ($_ ne "") { $flds{$name} .= $_  . " "; }
	    }
    
		$liblist = $flds{"liblist"};
		$query1 = $flds{"query1"};
		$class1 = $flds{"class1"};
		$or1 = $flds{"or1"};
		$query2 = $flds{"query2"};
		$class2 = $flds{"class2"};
	
		$sublib= $flds{"sublib"};  # gbsectionsearch
		
		$searchapp= $flds{"searchapp"};
		$ktup		= $flds{"ktup"};
		
		$seqtype 	= $flds{"seqtype"};
		$seqname	= $flds{"seqname"};
		$inputseq	= $flds{"inputseq"};
    }
  else {
    #$query= join(' ',@ARGV); 
    die "Must use dialog entry format";
    }

  }
else {
  if ($isdialog) {
    #chop( $query = <STDIN>);
    die "Gopher+ dialog format not ready";
    }
  else {
    #$query= join(' ',@ARGV); 
    die "Must use dialog entry format";
    }

  }
 
if ( length($inputseq) < 1) { print "No input sequence.\n"; die;  }
if ( length($liblist) + length( $sublib) < 1) { print "No libraries to search.\n"; die; };



#
# Check system load & refuse if too high...
#

&CheckLoad();

if ($sublib) {
	$subsetlib= $sublib;
	print <<TEOF
<b>$title</b>

<i>Databank Section:</i>
   $sublib
<i>Sequence name:</i>
   $seqname
<i>Similarity search function:</i>
   $searchapp

TEOF
;
	if (open(LOG,">> $logfile")) {
		 chop( $date = &ctime(time));
		 $caller= $ENV{"REMOTE_HOST"};
		 print LOG "$date\t$caller\t$sublib\t-9\n";
		 close(LOG);
		}
	goto SEQSEARCH;
}


$srsq= &BuildQuery( $liblist, $query1, $or1, $query2);

if (! $inputseq) { die "No input sequence." }
if (! $srsq) { die "No subset query." }
$slen= length($inputseq);

#
# Do analysis
#

$_= $srsq; s/\\//g; ## strip \ for display
print <<TEOF
<b>$title</b>

<i>Subset selection query:</i>
   $_
<i>Sequence name:</i>
   $seqname
<i>Similarity search function:</i>
   $searchapp

TEOF
;


# call srs/getz w/ -fse subset list option
&SRSQuery( $srsq, $subsetlib, $subsetoutput);


## ?? Test $subsetoutput for line count - is it too big to allow on exp. server?

($libcount, $rest) = split(' ', `wc $subsetoutput`);
if (open(LOG,">> $logfile")) {
 chop( $date = &ctime(time));
 $caller= $ENV{"REMOTE_HOST"};
 print LOG "$date\t$caller\t$srsq\t$libcount\n";
 close(LOG);
}

if ($libcount > $maxlib) {
 print <<TEOF

  This experimental server is currently limited to small data libraries.
  Large libraries will take too much time to process at present. 
  Your query generated $libcount entries. The maximum allowed now is $maxlib.
  Try to focus your keyword query more, to produce a smaller subset library.
TEOF
;
 exit(0);
}




SEQSEARCH:

open(SEQ, ">$seqfile");
print SEQ ">$seqname\n";
print SEQ "$inputseq\n";
close(SEQ);
# $seqtype -- do what with it?

# call (t)fasta search app
$ENV{'LIBTYPE'} = "12";  # select subset libtype
$fopts = "-Q -h";   
$cmd= "$seqsearchpath$searchapp $fopts $seqfile $p$subpath$subsetlib 12$p $ktup";
#print "$cmd\n";
@searchresult = `$cmd`;


#
#  Output results
#

if (!$dohtml) {
  print @searchresult;
  }
else {
  # do href markup of searchresult to link sequence matches to databanks
  $bestmark= 0;
  foreach (@searchresult) {
	
		if ($bestmark < 1) {
			if (/The best scores are:/) { $bestmark= 1; print "<hr>\n"; }
			print;
			}
				
		elsif ($bestmark == 1) {
			if (/^\w/) {
				($name, $rest) = split(/ /,$_,2);
	  		($lib,$id) = split(/:/,$name,2);
	  		if (!$lib) { $lib= $libname; $id= $name; }
	   		$href= sprintf("$srsIdLinkFmt ", $lib, $id, $name);
		    print $href, $rest;
		    $links{$name} = $href;
		    }
	 		else { $bestmark= 2; print; }
			}
				
		else {
			if (/^\w+\:\w+ /) {
		 	  ($name, $rest) = split(/ /,$_,2);
		  	$href= $links{$name};
			  if ($href) { print "<hr>", $href, $rest; }
		   	else { print;}
		 		}
			else { print;}
			}
		}
}
  
  
# optionally provide $subsetoutput

if ($dohtml) { print "\n</pre></body>\n"; }

unlink($seqfile);
unlink($subsetoutput);
unlink("$subpath$subsetlib");

exit(0);

#-------------------------------


sub unescapeUrl
{
	## convert "%7Cbob%3c+%5Cjoe%3a" to "|bob< \joe:";
	local($_) = @_;
	s/\+/ /g;    # undo space-escape before hexes
	s/\%(..)/pack(H2,$1)/eg;
	$_;
}

sub CheckLoad
{
	$_ = `$sysloadaverage`;
	if (/load average: ([\d\.]+), ([\d\.]+), ([\d\.]+)/) {
	  $load1= $1;  $load5= $2;  $load15= $3;
	  ##print "sys loads: $load1 -- $load5 -- $load15 \n";
	  if ($load1 > $maxload) {
	  	print <<TEOF
\nThis server is too busy now.  Maximum load is $maxload.
Current load average: $load1 (1-min), $load5 (5-min), $load15 (15-min)
\nPlease try your search later.
TEOF
;
	  	exit(0);
	  	}
		}
}


sub BuildQuery
{
	local($liblist, $query1, $or1, $query2) = @_;
	local( $_);
	
	if (!$query1 && !$query2) { return ""; }
#
# new form of getz query, use '-l "lib1 lib2" [sq-class:terms]'
#
	$_ = "-l $q$liblist$q ";
	if ($query1) { 
	  $query1 =~ s/ //g; # squeeze spaces
	  $_  .= "[SQ-$class1:$query1]"; 
	  if ($query2) { 
			if ($or1 =~ /and/) { $_ .= '&'; }
			elsif ($or1 =~ /or/) { $_ .= '|'; }
			elsif ($or1 =~ /not/) { $_ .= '!'; }
	  	} 
	  }
	if ($query2) { 
	  $query2 =~ s/ //g; # squeeze spaces
	  $_ .= "[SQ-$class2:$query2]"; 
	  }
	
#
# escape some shell symbols ! or pass to (w)getz via ENV?
#
	s/\&/\\\&/g;
	s/\|/\\\|/g;
	s/\!/\\\!/g;

	return $_;    
}



sub SRSQuery {

	local($params, $subsetlib, $outfile) = @_;
	$srsapp= "getz"; # or "wgetz" ??
	
	$ENV{'SRSROOT'} = $SRSROOT;
	
	# find out operating system and define a symbolic name
        $OS = `uname`;
        if ($OS == 'SunOS' && `uname -r` =~ /^[56].*/) { $OS = 'Solaris'; }
	
	$_= $OS;
	OSCASE: {
		if (/AIX/)    { $OS_SPECIFIC = "aix"; last OSCASE; }
		if (/HP-UX/)  { $OS_SPECIFIC = "hpux"; last OSCASE; }
		if (/IRIX/)   { $OS_SPECIFIC = "irix"; last OSCASE; }
		if (/IRIX64/) { $OS_SPECIFIC = "irix64"; last OSCASE; }
		if (/IRIX/)   { $OS_SPECIFIC = "irix"; last OSCASE; }
		if (/SunOS/)  { $OS_SPECIFIC = "sunos"; last OSCASE; }
		if (/Solaris/){ $OS_SPECIFIC = "solaris"; last OSCASE; }
		if (/OSF1/)   { $OS_SPECIFIC = "osf"; last OSCASE; }
		if (/ULTRIX/) { $OS_SPECIFIC = "ultrix"; last OSCASE; }
		die "Unsupported operating system: $OS";
	}
	
	# SRS internal directories
	
	$ENV{'SRSDAT'} = "$SRSROOT/etc";
	$ENV{'SRSSDL'} = "$SRSROOT/odd";
	$ENV{'SRSSOU'} = "$SRSROOT/src";
	$ENV{'SRSCOM'} = "$SRSROOT/etc";
	$ENV{'SRSETC'} = "$SRSROOT/etc";
	$ENV{'SRSTEMP'} = "$SRSROOT/tmp";
	$ENV{'SRSWWW'} = "$SRSROOT/www";
	$ENV{'SRSWWWTMP'} = "$SRSROOT/www/tmp";
	$ENV{'SRSINX'} = "$SRSROOT/index";
	$ENV{'SRSSEC'} = "$SRSROOT/etc/$OS_SPECIFIC";
	$SRSEXE = "$SRSROOT/bin/$OS_SPECIFIC";
	$ENV{'SRSEXE'} = "$SRSEXE";
	
	# run getz 
	#set noglob = turn off file name expansion...[]!
	#print("set noglob; nice $SRSEXE/$srsapp -fse $subsetlib $params > $outfile\n");
	system("set noglob; nice $SRSEXE/$srsapp -fse $subsetlib $params > $outfile");
}


# try wgetz instead of getz...  this is input form for wgetz -
# libList=GENBANK&retrieveFieldName1=Organism&retrieveStr1=drosophilidae
# &retrieveFieldName2=AllText&retrieveStr2=esterase&retrieveFieldName3=AllText
# &retrieveStr3=
# &retrieveFieldName4=AllText&retrieveStr4=&makeWild=on&operator=AND
# &userId%7C%7CdoQuery=810692647_129.79.18.11%7C%7Cyes&seqFormat=PIR
# &entryType=entries&listEntriesChunkSize=50&viewEntriesChunkSize=10




.
