#!/usr/bin/perl
#
#    Copyright 2004 by Jason Stover.
#
#    This program is free software; you can redistribute it and/or
#    modify it under the terms of the GNU General Public License as
#    published by the Free Software Foundation; either version 2 of the
#    License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
#    02111-1307  USA
#
# is_good($text)
#
# Returns 1 when $text is free of markup residue, 0 otherwise.  A string
# is rejected when it contains:
#   * an angle bracket ('<' or '>'), which covers every tag form such as
#     <em>, </span> or <anything>; or
#   * a run of two or more hyphens ("--").
#
# The argument is held in a lexical, so calling this sub never disturbs
# the caller's package variables (the main script uses $x itself).
sub is_good
{
    my ($text) = @_;

    # Any angle bracket at all means leftover HTML.
    return 0 if $text =~ /[<>]/;

    # Two or more consecutive hyphens.
    return 0 if $text =~ /-{2,}/;

    return 1;
}
# ----------------------------------------------------------------------
# Settings.  Everything assigned at top level below is deliberately left
# as a package global (no `my`) so that any code further down the file
# that refers to these names keeps working.
# ----------------------------------------------------------------------
$kappa = 7;
$lag = 10;
$booknum = shift(@ARGV);  # First argument; whatever remains in @ARGV (or STDIN) is read by <> below.
$sequencelength = 2*$kappa + $lag - 1;  # Length of the sequence to pass to wordsense.lisp.
$middle = -1 + ($sequencelength + 1)/2; # 11 for a length of 23; presumably the zero-based middle index.
$word = 'fruit';                        # The word of interest.

# clean_line($line)
#
# Returns a copy of $line in which the HTML entities and tags listed
# below, plus commas and semicolons, have been turned into tabs (i.e.
# token separators).  "&#146;s" becomes a separate token "s"; any other
# "&#146;" is simply deleted.
sub clean_line
{
    my ($line) = @_;

    $line =~ s/&\#146;s/\ts\t/g;
    $line =~ s/&\#146;//g;

    # Entities and tags must go BEFORE the generic [,;] pass: every
    # entity ends in ';', so removing semicolons first would leave
    # "&quot", "&#148" etc. behind as bogus words.
    $line =~ s{&quot;|</?[pi]>|&\#(?:145|147|148|151);}{\t}g;

    $line =~ s/[,;]/\t/g;
    return $line;
}

# normalize_token($token)
#
# Returns the table key for a whitespace-delimited token: lowercased and
# with the sentence terminators '.', '?' and '!' removed.  Sentences are
# split on exactly those characters, so a key that kept any of them
# could never be found in a sentence.  The result may be the empty
# string (e.g. for the token "...").
sub normalize_token
{
    my ($token) = @_;
    $token = lc $token;
    $token =~ tr/.?!//d;
    return $token;
}

# tally_sentence($tables, $sentence, $target)
#
# Adds one observation to every table in %$tables for $sentence.
#   $tables   - hash ref: key word => [n00, n01, n10, n11]
#   $sentence - text of one sentence, terminators already removed
#   $target   - the word of interest
# A word counts as present when it equals, ignoring case, one of the
# whitespace-delimited tokens of the sentence; no regex is built from
# the words, so tokens such as "(see" or "*" are harmless.
# Returns 1 if the sentence was counted, 0 if it held no tokens at all
# (such fragments arise from "..." or "?!" and are not real sentences).
sub tally_sentence
{
    my ($tables, $sentence, $target) = @_;

    my %present;
    $present{lc $_} = 1 for split ' ', $sentence;
    return 0 unless %present;

    # Cell index is 2*(target present) + (key present).
    my $row = exists $present{lc $target} ? 2 : 0;
    foreach my $key (keys %$tables)
    {
	my $col = exists $present{$key} ? 1 : 0;
	$tables->{$key}[$row + $col] += 1;
    }
    return 1;
}

# Read all input into one long string, one space before each line.
$singleline = '';
while (my $line = <>)
{
    chomp $line;
    $singleline .= ' ' . clean_line($line);
}

@Words = split ' ', $singleline;
%Wordtables = ();
# Wordtables' elements are array refs. [n00,n01,n10,n11], where
# nij = count for $word vs. $key, 0 = no, 1 = yes. The first subscript refers
# to $word, the second to $key.
$i = 0;  # Number of distinct keys created.
while (@Words)  # @Words is consumed here, as it always was.
{
    my $key = normalize_token(shift @Words);
    next if $key eq '';               # token was nothing but punctuation
    next if exists $Wordtables{$key};
    $Wordtables{$key} = [0,0,0,0];
    $i++;
}

# Splitting on the terminators also removes them from each sentence.
@Sentences = split(/[.?!]/, $singleline);
foreach my $sentence (@Sentences)
{
    tally_sentence(\%Wordtables, $sentence, $word);
}
# Now that we have the tables, compute either the chi-square or
# Fisher's exact p-value... But we don't need those 'filler' words
# from sentences that don't contain our word of interest, so why
# not build the contingency tables straight from the concordances?