#!/usr/bin/perl
#
#    Copyright 2004 by Jason Stover.
#
#    This program is free software; you can redistribute it and/or
#    modify it under the terms of the GNU General Public License as
#    published by the Free Software Foundation; either version 2 of the
#    License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
#    02111-1307  USA
#
#
# This file was used to create the final data file /mnt/biafra/var/concordances.lisp
# from the text file ~/tmp/concordances3.txt. concordances3.txt was created
# with get-concordance.pl and get-concordances.sh from the original data
# stored in ~/var/corpi/fiction/cd1.
#
#
# To make this work better, create partial matches by allowing similar
# words to match, e.g. allow 'is' and 'am' to give a value of .5
# instead of 1 since both are conjugations of the same infinitive. This
# will require a lot of typing.

########################################
# Contains array refs. of words that can
# be considered partial matches.
# In addition, create partial matches for words that match
# with the exception of a prefix/suffix.
@Matchgroups = (['is','be','am','are','was','were','been','being','become'],
		['may','might','must'],
		['cant','could','should','would','cannot'],
		['shall','will','wont'],
		['do','does','did'],
		['have','has','had'],
		['he','him','his','she','her','hers','they','them','their','it','its','himself','herself','themselves'],
		['you','your','yours','yourself'],
		['i','me','mine','myself'],
		['us','we','our','ours','ourselves'],
		['this','that','these','those'],
		['here','there'],
		['of','for'],
		['around','about','above','below','over','under','beside','beneath','upon'],
		['in','inside','out','outside','within'],
		['with','against','without'],
		['by','through'],
		['a','an','the'],
		['like','such', 'as'],
		['though','although','despite','nevertheless','however'],
		['and','but','or'],
		['who','what','where','why','how','which']);
#########################################################
## A key in Partial_matches is a word. It points to 
## an array of words that are that key's partial-matching
## words.
%Partial_matches = ();
@Suffixes = ('ing','ed','ly','s','es','t','d','ion',
	     'er','ment','able','ible','age','al','ance',
	     'ence','ant','ent','ess','ful','hood',
	     'ian','ical','ify','ish','ist','ive','ise',
	     'less','like','ness','or','ous','ship',
	     'wards','ward','wise','y');
@Prefixes = ('un','pre','dis','anti','auto','bi',
	     'co','con','contra','de','ex','fore',
	     'in','il','im','ir','inter','kilo',
	     'micro','mid','mini','multi','mono',
	     'mis','non','out','over','post','pro',
	     're','semi','sub','super','trans','tri',
	     'ultra','under');
sub Match
{
    my $word1 = shift @_;
    my $word2 = shift @_;
    if($word1 =~ /^\w{1,5}$word2$/)
    {
	foreach $prefix (@Prefixes)
	{
	    if($word1 =~ /^$prefix$word2/)
	    {
		return 1;
	    }
	}
    }
    if($word1 =~ /^$word2\w{1,5}$/) 
    {
	foreach $suffix (@Suffixes)
	{
	    if($word1 =~ /$word2$suffix$/)
	    {
		return 1;
	    }
	    elsif(length($word2)>length($suffix))
	    {
		my @Root = split '',$word2;
		pop @Root; 
		my $root = join '',@Root;
		if($word1 =~ /^$root$suffix$/)
		{
		    return 1;
		}
		pop @Root;
		my $root = join '',@Root;
		if($word1 =~ /^$root$suffix$/)
		{
		    return 1;
		}
	    }
	}
    }
    return 0;
}
    
@Lines=();
%Wordhash=();
$nextcode=0;
$seed=0;
while($line=<>)
{
    chomp($line);
    $line =~ s/\&\#\d\d\d//g;
    $line =~ s/\<\/span>//g;
    $line =~ s/\<.+\>//g;
    $line =~ s/\<em\>//g;
    $line =~ s/[\\\[\]\;\?\.\!\,\'\`\"\:\(\)\-]//g;
    $line =~ s/\trage\t/\tragefury\t/g;
    $line =~ s/\tfury\t/\tragefury\t/g;
    @tmp = split ' ',$line;
    $tmp2 = join "\t",@tmp;
#    if($tmp[13] eq 'bank')
#    {
#	$preserved = shift @tmp;
#	$tmp2 = join "\t",@tmp;
#	$tmp2 =~ s/\tfruit\t/\tfruitdoor\t/g;
#	$tmp2 =~ s/\tdoor\t/\tfruitdoor\t/g;
#	$tmp2 = join "\t",$preserved,$tmp2;
	push @Lines,$tmp2;
#    }
}
print "(setf concordances '(";
while(@Lines)
{

    $line1 = shift(@Lines);
    @Line1 = split ' ',$line1;

    $realword = shift(@Line1);
    $corp1 = shift(@Line1);
# Encode the words so we don't use too much disk space.
    for ($i=0;$i<scalar(@Line1);$i++)
    {
	$tmp = lc($Line1[$i]);
	if (!(exists $Wordhash{$tmp}))
	{
	    $Wordhash{$tmp} = $nextcode;
	    $nextcode++;
	}
	$Line1[$i]=$Wordhash{$tmp};
    }
    $line1 = join "\t",@Line1;
    print "((",$realword,")\t(",$corp1,")\t(",$line1,")) ";
}
print "))\n";
#########################################
# Get the 'ing' and 'ed', etc. endings.
@Keys = keys(%Wordhash);
while(@Keys)
{
    $key1 = shift(@Keys);
    if(length($key1)>3)
    {
	foreach $key2 (@Keys)
	{
	    if(length($key2) > 3)
	    {
		if(Match($key1,$key2))
		{
		    if(exists($Partial_matches{$key1}))
		    {
			push @{$Partial_matches{$key1}},$key2;
		    }
		    else
		    {
			$Partial_matches{$key1} = [$key2];
		    }
		}
		elsif(Match($key2,$key1))
		{
		    if(exists($Partial_matches{$key1}))
		    {
			push @{$Partial_matches{$key1}},$key2;
		    }
		    else
		    {
			$Partial_matches{$key1} = [$key2];
		    }
		}
	    }
	}
    }
}
######################################
# Match pronouns and common verbs.
foreach $arref (@Matchgroups)
{
    foreach $x (@{$arref})
    {
	$Partial_matches{$x} = $arref;
    }
}
print "(defvar *partial-matches* (make-hash-table))\n";
foreach $key (keys(%Partial_matches))
{
    $wordnum = $Wordhash{$key};
    @word_pals = @{$Partial_matches{$key}};
#    for ($i = 0; $i < scalar(@word_pals); $i++)
#    {
#	$tmp = $word_pals[$i];
#	$word_pals[$i] = $Wordhash{$tmp};
#    }
    print "(setf (gethash ",$key," *partial-matches*) '(",(join ' ',@word_pals),"))\n";
}

###########################################
# Print the word hash so we can map numbers 
# back to words later.
foreach $key (keys(%Wordhash))
{
    print STDERR $key,"\t",$Wordhash{$key},"\n";
}

#foreach $key (keys(%Wordhash))
#{
#    print $key,"\t",$Wordhash{$key},"\n";
#}
#    foreach $x (@Lines)
#    {
#	@Line2 = split ' ',$x;
#	$realword2 = shift(@Line2);
	# Sample here: we need all the cases in which the concordances agree, but
	# only a few in which they disagree on the target word.
#	if(!@Rand_state)
#	{
#	    $rand_state =  `export GSL_RNG_SEED=$seed;/mnt/biafra/jason/src/compling/a.out`;
#	    @Rand_state = split ' ',$rand_state;
#	    $seed = pop @Rand_state;
#	}
#	$rn = shift @Rand_state;
#	if((lc($realword2) eq 'fruit') || (lc($realword) eq 'fruit'))
#	{
# 	    $corp2 = shift(@Line2);
# 	    for ($i=0;$i<scalar(@Line2);$i++)
# 	    {
# 		$tmp = $Line2[$i];
# 		if (!(exists $Wordhash{$tmp}))
# 		{
# 		    $Wordhash{$tmp} = $nextcode;
# 		    $nextcode++;
# 		}
# 		$Line2[$i]=$Wordhash{$tmp};
# 	    }
# 	    $line2 = join "\t",@Line2;
# 	    print "(",$realword,")\t(",$realword2,")\t(",$corp1,")\t(",$corp2,")\t(",$line1,")\t(",$line2,")\n";
# 	}
#     }
#}