#!/usr/bin/perl
#
#    Copyright 2004 by Jason Stover.
#
#    This program is free software; you can redistribute it and/or
#    modify it under the terms of the GNU General Public License as
#    published by the Free Software Foundation; either version 2 of the
#    License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
#    02111-1307  USA
#
# is_good($token)
#
# Decide whether a whitespace-delimited token may be used as a word in a
# concordance window.
#
# Parameters:
#   $token - the candidate token (a string).
#
# Returns:
#   1 if the token is acceptable, 0 otherwise.
#
# A token is rejected when it
#   * contains "onmouseover" (any letter case), "popDown", "popUp" or
#     "overlib" -- presumably JavaScript identifiers left over from the
#     HTML source of the texts (TODO confirm), or
#   * contains any non-word character (anything outside [A-Za-z0-9_] for
#     plain ASCII input).  This single test also covers the markup
#     fragments that used to be tested one by one: '<', '>', runs of
#     dashes, '</span>', '<em>' and any other '<...>' tag.
#
# An undefined or empty token matches none of the patterns and therefore
# returns 1, exactly as it always has.
#
# The argument is held in a lexical variable, so calling this function no
# longer overwrites package globals named $x or $okee_dokee.
sub is_good
{
    my ($token) = @_;

    # Normalise a missing argument so the pattern matches below never see
    # undef; the result for that case stays 1.
    $token = '' unless defined $token;

    # Script identifiers consist only of word characters, so the \W test
    # below would not catch them; they have to be named explicitly.
    return 0 if $token =~ /onmouseover/i;
    return 0 if $token =~ /popDown|popUp|overlib/;

    # Markup remnants, punctuation, dashes, entities, ...
    return 0 if $token =~ /\W/;

##### Use this to drop common words############
#    $token = lc($token);
#    if ($pvals{$token} > .05 )
#    {
#        return 0;
#    }
##########################################
    return 1;
}
# ---------------------------------------------------------------------
# Main script: print KWIC (key word in context) concordance lines.
#
# Usage:  <script> booknum word [file ...]
#
#   booknum - identifier echoed in the second column of every output line
#   word    - the key word; a line is printed each time this word sits in
#             the middle of the sliding window
#   file... - text to scan (standard input when no file is given)
#
# Output: one line per hit, tab separated:
#   word, booknum, then the $sequencelength tokens of the window.
# ---------------------------------------------------------------------

# Both pragmas are lexically scoped, so placing them here puts only the
# script body below under strict/warnings.
use strict;
use warnings;

my $kappa = 7;
my $lag   = 10;
my $booknum = shift(@ARGV);
######################################################
# Change $word to be whatever word for which you want
# KWIC concordances.
my $word = shift(@ARGV);
######################################################
die "usage: $0 booknum word [file ...]\n"
    unless defined $booknum && defined $word;

my $sequencelength = 2*$kappa + $lag - 1; # Length of the sequence to pass to wordsense.lisp.
my $middle = -1 + ($sequencelength + 1)/2; # 0-based index of the centre token.

# Read the text, strip markup/punctuation and collect the tokens.  Each
# line is tokenised as soon as it has been cleaned, which yields the same
# token list as gluing all lines together with blanks and splitting once,
# without re-copying the accumulated text for every input line.
my @Line = ();
while (my $line = <>)
{
    chomp $line;
    $line =~ s/&\#146;s/\ts\t/g;          # entity apostrophe + s: split off the "s"
    $line =~ s/&\#146;//g;
    $line =~ s/[\!\:\?;,\.\-]/\t/g;       # punctuation becomes a separator
    $line =~ s/(&quot)|(<p>)|(<\/p>)|(<i>)|(<\/i>)|(&\#148)|(&\#151)|(&\#145)|(&\#147)/\t/g;
    # "\s" means whitespace only inside a pattern; in the replacement part
    # of s/// it is a literal "s".  Keep the surrounding whitespace with
    # look-arounds instead of trying to re-insert it.
    $line =~ s/(?<=\s)it's(?=\s)/it\tis/g;
    $line =~ s/\'s/\ts/g;
    $line =~ s/\'//g;
    $line =~ s/\`//g;
    $line =~ s/[()]/ /g;                  # parentheses become blanks (not a literal "s")
    $line =~ s/Chapter\s\d{1,2}\s//;
    $line =~ s/size=//;
    push @Line, split(' ', $line);
}
####################
# The following file contains p values for each of the words in the texts.
# These p values will be used to exclude common words.
# Dropping these common words gives lousy results. Boxy histograms, etc.
#open(PVALS,"</mnt/biafra/jason/var/p-val-hash.txt") or die "error: cannot open file\n";
#%pvals=();
#while($line=<PVALS>)
#{
#    chomp $line;
#    ($key,$pval) = split ' ',$line;
#    $pvals{$key} = $pval;
#}
#close(PVALS);
######################

#%Wordhash = ();
#$nextcode = 1; # Code 0 is for words which do not depend on context.
# Encode the words so we don't use too much disk space.
# @Coded_words = ();
# for ($i=0;$i<scalar(@Line1);$i++)
# {
#     $tmp = lc($Line1[$i]);
#     if( is_good($tmp))
#     {
# 	if (!(exists $Wordhash{$tmp}))
# 	{
# 	    $Wordhash{$tmp} = $nextcode;
# 	    $nextcode++;
# 	}
# 	push @Coded_words,$Wordhash{$tmp};
#     }
# }

# Fill the initial window with the first $sequencelength acceptable
# tokens.  Stop when the input runs out so that a short text can never put
# undefined entries into the window.
my @Current = ();
while (@Current < $sequencelength && @Line)
{
    my $tmp = shift @Line;
    push(@Current, $tmp) if is_good($tmp);
}

# Slide the window one acceptable token at a time.
# $window_is_new guards against reporting the same window again when the
# token just consumed was rejected and the window therefore did not move.
# NOTE(review): the loop stops while $kappa raw tokens are still unread, so
# the final windows of the text are never tested -- kept as is; confirm
# that this is intended.
my $window_is_new = 1;
while (scalar(@Line) > $kappa)
{
    if ($window_is_new && $Current[$middle] eq $word)
    {
        my $concordance = join "\t", @Current;
        print $word, "\t", $booknum, "\t", $concordance, "\n";
    }
    $window_is_new = 0;

    my $tmp = shift @Line;
    if (is_good($tmp))
    {
        shift(@Current);
        push(@Current, $tmp);
        $window_is_new = 1;
    }
}

