#!/usr/bin/perl
#
# Copyright 2004 by Jason Stover.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# 02111-1307 USA
#
# Prints KWIC (key word in context) concordances for one word.
#
# Usage:  <this script> BOOKNUM WORD [FILE ...]
#
# The text is read from the named files (or standard input), stripped of
# punctuation and markup, and split into words.  Every time WORD sits in
# the middle of a window of $sequencelength acceptable words, one line is
# written to standard output:
#
#     WORD <tab> BOOKNUM <tab> word_1 <tab> ... <tab> word_N

use strict;
use warnings;

my $kappa = 7;
my $lag   = 10;

# Length of the sequence to pass to wordsense.lisp.
my $sequencelength = 2 * $kappa + $lag - 1;

# Zero-based index of the middle element of a window of that length.
my $middle = -1 + ( $sequencelength + 1 ) / 2;

# is_good($token) -> 1 or 0
#
# Returns 1 when $token may enter the context window, 0 otherwise.
# A token is rejected when it is undefined or empty, contains any
# non-word character (this covers markup remnants such as "<", ">",
# "--" and "</span>"), or looks like a JavaScript pop-up handler name.
sub is_good {
    my ($token) = @_;

    return 0 if !defined $token || $token eq '';
    return 0 if $token =~ /\W/;
    return 0 if $token =~ /onmouseover/i;
    return 0 if $token =~ /popDown|overlib|popUp/;

    ##### Use this to drop common words ############
    # return 0 if $pvals{ lc $token } > .05;
    # Dropping these common words gives lousy results: boxy
    # histograms, etc.  NOTE(review): the code that loaded %pvals
    # (one "word pvalue" pair per line) was already disabled and its
    # file name did not survive; restore it before enabling this.
    ################################################

    return 1;
}

# clean_line($line) -> string
#
# Returns a copy of $line in which punctuation, quotes, HTML entities and
# paragraph/italic tags have been replaced by whitespace (or removed), so
# that a plain whitespace split yields candidate words.  The order of the
# substitutions matters: ";" is turned into a tab before the entity
# patterns run, which is why those patterns carry no trailing ";".
sub clean_line {
    my ($line) = @_;

    $line =~ s/&\#146;s/\ts\t/g;        # possessive written with an entity
    $line =~ s/&\#146;//g;
    $line =~ s/[!:?;,.\-]/\t/g;

    # NOTE(review): "<p>" and "<i>" are reconstructed from their closing
    # counterparts; the opening tags had been stripped from this pattern.
    $line =~ s{"|<p>|</p>|<i>|</i>|&\#148|&\#151|&\#145|&\#147}{\t}g;

    # "\s" is only a character class on the pattern side; the replacement
    # must reuse the captured whitespace, otherwise a literal "s" is
    # inserted.  The look-ahead leaves the trailing whitespace in place.
    $line =~ s/(\s)it's(?=\s)/$1it\tis/g;
    $line =~ s/'s/\ts/g;
    $line =~ tr/'`//d;
    $line =~ tr/()/  /;                 # parentheses become blanks
    $line =~ s/Chapter\s\d{1,2}\s//;
    $line =~ s/size=//;

    return $line;
}

# concordance_lines($word, $booknum, \@tokens) -> list of output lines
#
# Slides a window of $sequencelength acceptable tokens (see is_good) over
# @tokens and returns one newline-terminated, tab-separated line for each
# distinct window whose middle element equals $word.
sub concordance_lines {
    my ( $word, $booknum, $tokens ) = @_;

    my $total = scalar @{$tokens};
    my $next  = 0;                      # index of the first unread token

    # NOTE(review): the original initialisation of the window was lost;
    # it is rebuilt here from the first $sequencelength acceptable tokens.
    my @window;
    while ( $next < $total && @window < $sequencelength ) {
        my $candidate = $tokens->[ $next++ ];
        push @window, $candidate if is_good($candidate);
    }

    # NOTE(review): only "$kappa)" of the original loop condition
    # survived; the left-hand side is taken to be the number of unread
    # tokens -- confirm.
    my @lines;
    my $window_is_new = 1;
    while ( $total - $next > $kappa ) {

        # Report a window only once; a rejected token leaves the window
        # unchanged and must not cause the same line to be printed again.
        if ( $window_is_new && $window[$middle] eq $word ) {
            push @lines, join( "\t", $word, $booknum, @window ) . "\n";
        }

        my $candidate = $tokens->[ $next++ ];
        $window_is_new = is_good($candidate);
        if ($window_is_new) {
            shift @window;
            push @window, $candidate;
        }
    }

    return @lines;
}

# main() -> 0 on success, 1 when the arguments are missing
#
# Takes BOOKNUM and then WORD from @ARGV (WORD is the word for which you
# want KWIC concordances), reads the remaining files or standard input,
# and prints the concordance lines.
sub main {
    my $booknum = shift @ARGV;
    my $word    = shift @ARGV;

    if ( !defined $booknum || !defined $word ) {
        warn "usage: $0 BOOKNUM WORD [FILE ...]\n";
        return 1;
    }

    # NOTE(review): a disabled experiment that replaced words by numeric
    # codes "so we don't use too much disk space" (code 0 for words which
    # do not depend on context) stood here; its text did not survive.
    my @tokens;
    while ( my $line = <> ) {
        chomp $line;
        push @tokens, split ' ', clean_line($line);
    }

    print concordance_lines( $word, $booknum, \@tokens );
    return 0;
}

main() unless caller;