;;
;;    Copyright 2004 by Jason Stover.
;;
;;    This program is free software; you can redistribute it and/or
;;    modify it under the terms of the GNU General Public License as
;;    published by the Free Software Foundation; either version 2 of the
;;    License, or (at your option) any later version.
;;
;;    This program is distributed in the hope that it will be useful,
;;    but WITHOUT ANY WARRANTY; without even the implied warranty of
;;    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;    GNU General Public License for more details.
;;
;;    You should have received a copy of the GNU General Public License
;;    along with this program; if not, write to the Free Software
;;    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
;;    02111-1307  USA
;;
;; Not sure what these values should be.
(defparameter *kappa* 7)
(defparameter *threshold* 1)
(defparameter *lag* 11)
(defparameter *markov-partition* 5) ;This is the window length.
                                    ;I have called it *markov-partition*
                                    ;to remind myself that I got the idea from
                                    ;dynamical systems.
(defparameter *theta* .8)
;;
;; Count the number of matches which were not -1's.
(defvar *not-minus1-match* 0)
(defvar *not-minus1-not-match* 0)
(defvar *minus1-match* 0)
(defvar *minus1-not-match* 0)
;;
;; Elementwise distance will accept a list of length *five-bound*
;; (the variable name was taken from the meaning of this bound as given
;; in Lalley, "Beneath the Noise, Choas", Annals of Statistics, circa 2000.)
;; Note we permute the list to allow more matches. This is equivalent to finding
;; to orbits sets that 'shadow' the point of interest on the corresponding manifold.
;; If we take a string of words to be an observed Markov partition, then
;; the noise added on the manifold is equivalent to perturbing the order of the 
;; observed Markov partitions. So permuting the words is just the way of finding
;; which values shadow each other, assuming the diameter of a set in the Markov 
;; partition is bounded by c*|noise| (measured on the manifold).
;;
;; This raises the question: How far apart two sentences can be before we
;; can say they 'shadow' each other. Barring the proof of an amazing theorem, 
;; the only way to answer this is to experiment.
(defun elementwise-distance (x y)
  (let ((y-copy y)
	(nonmatches 0)
	(x-len (length x)))
    (loop (when (endp y-copy) (return nonmatches))
	  (let ((nexty (pop y-copy)))
	    (if (member nexty x) t (setf nonmatches (+ nonmatches 1)))))
    (loop (when (endp x) (return (/ nonmatches (+ x-len (length y)))))
;    (loop (when (endp x) (return nonmatches)) ; No division here, just return the integer
	  (let ((nextx (pop x)))
	    (if (member nextx y) t (setf nonmatches (+ nonmatches 1)))))))

(defun partial-match-p (x y)
  ;; x is a token, y is a list of tokens
  (let ((non-match 1)
	(nexty (pop y)))
    (loop (when (or (endp y) (not non-match)) (return non-match))
	  (if (or (member nexty (gethash x *partial-matches*))
		  (member x (gethash nexty *partial-matches*))
		  (eq x nexty))
	      (setq non-match 0) t)
	  (setq nexty (pop y)))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This is elementwise-distance modified to allow partial matches.
;; In elementwise-distance, two tokens either match or not. Those that do
;; not match contribute a 1 to the distance. Here, elements that match partially
;; contribute a 1/2 to the distance.
(defun partial-match-elementwise-distance (x y &optional (result (list)) (subscript 0))
  (if (endp x) result
    (let ((sub-y (do* ((y-subscr (max 0 (floor (- subscript (/ (- *markov-partition* 1) 2))))
				 (incf y-subscr))
		       (z (list (nth y-subscr y)) (push (nth y-subscr y) z)))
		      ((eq y-subscr 
			   (min (length y) (floor (+ subscript (/ (- *markov-partition* 1) 2))))) 
		       z))))
      (partial-match-elementwise-distance (cdr x) y 
					  (push (partial-match-p (car x) sub-y) result) (incf subscript)))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Don't actually compute the elementwise-distance, just count
;; the matches of different types.
(defun count-matches (x y)
  (let ((y-copy y)
	(nonmatches 0)
	(x-len (length x)))
    (loop (when (endp y-copy) (return nonmatches))
	  (let ((nexty (pop y-copy)))
  	    (cond ((and (not (member nexty x)) (not (eq -1 nexty))) (incf *not-minus1-not-match*))
		  ((and (not (member nexty x)) (eq -1 nexty)) (incf *minus1-not-match*))
		  ((and (member nexty x) (eq -1 nexty)) (incf *minus1-match*))
		  ((and (member nexty x) (not (eq -1 nexty)) (incf *not-minus1-match*))))
	    (if (member nexty x) t (setf nonmatches (+ nonmatches 1)))))
    (loop (when (endp x) (return nonmatches))
	  (let ((nextx (pop x)))
  	    (cond ((and (not (member nextx y)) (not (eq -1 nextx))) (incf *not-minus1-not-match*))
		  ((and (not (member nextx y)) (eq -1 nextx)) (incf *minus1-not-match*))
		  ((and (member nextx y) (eq -1 nextx)) (incf *minus1-match*))
		  ((and (member nextx y) (not (eq -1 nextx)) (incf *not-minus1-match*))))
	    (if (member nextx y) t (setf nonmatches (+ nonmatches 1)))))))

;; 
;; Get the distance between two sequences.
(defun fill-partition (group &optional (result (list)))
  (if (= (length result) *markov-partition*) result
    (fill-partition (cdr group) (push (pop group) result))))
(defun update-exponent (current prior)
  (if (= 0 current) (if (= -1 prior) 0 (1+ current))
    (1+ current)))

(defun symmetric-distance (x y)
  (let* ((dist 0)
	 (x-sublist ())
	 (y-sublist ())
	 (nextx 0)
	 (nexty 0)
	 (exponent (- (1+ (- (/ (- (length x) 1) 2) (ceiling (/ (1+ *markov-partition*) 2))))))
	 (prior (- exponent 1)))
    (loop 
	  (setq x-sublist (fill-partition x))
	  (setq y-sublist (fill-partition y))
	  (when (< (length y) *markov-partition*) (return dist))
	  (setq dist (+ dist (* (expt .5 (abs exponent)) (elementwise-distance x-sublist y-sublist))))
	  (setq nextx exponent)
	  (setq exponent (update-exponent exponent prior))
	  (setq prior nextx)
	  (setq nexty (pop y))
	  (setq nextx (pop x)))))
;;
;; Distance will use larger exponents before the middle word. It also drops the
;; first word in the concordance.
(defun distance (x y)
  (let ((dist1 (partial-match-elementwise-distance x y))
	(dist2 (partial-match-elementwise-distance y x)))
    (do* ((z (/ (- (length x) 1) 2) (decf z))
	  (exponent (abs z) (abs z))
	  (dist 0 (setf dist (+ dist (* (expt *theta* exponent) (+ (pop dist1) (pop dist2)))))))
	 ((endp dist1) dist))))

(defun count-match-dist (x y)
  (let ((dist 0)
        (exponent (/ (- (length x) 3) 2)))
    (loop (setq x-sublist ())
          (setq y-sublist ())
          (multiple-value-bind (y-sublist x-sublist)
                               (let ((y-copy y)
                                     (x-copy x))
                                 (loop (when (= (length y-sublist) *markov-partition*) 
					 (return (values y-sublist x-sublist)))
                                       (push (pop y-copy) y-sublist)
                                       (push (pop x-copy) x-sublist))))
          (setq nexty (pop y))
          (setq nextx (pop x))
          (when (< (length y) *markov-partition*) (return dist))
          (setq exponent (- exponent 1))
          (setq dist (+ dist (* (expt *theta* (abs exponent)) (count-matches x-sublist y-sublist)))))))

;;; (defun distance (x y)
;;;   (let* ((x-sublist ());
;;; 	 (y-sublist ())
;; 	 (dist 0)
;; 	 (exponent (/ (- (length x) *markov-partition*) 2)))
;;     (loop (setq x-sublist (fill-partition x))
;; 	  (setq y-sublist (fill-partition y))
;; 	  (when (< (length y) *markov-partition*) (return dist))
;; 	  (setq dist (+ dist (* (expt .5 (abs exponent)) (elementwise-distance x-sublist y-sublist))))
;; 	  (setq exponent (- exponent 1))
;; 	  (pop x)
;; 	  (pop y))))
;;
;; Compute the usual mean distance, rather than the sum with the powers of 2 in
;; the denominator. Unlike the distance function above, this does not give normally 
;; distributed data.
(defun mean-distance (x y)
  (let* ((x-sublist ())
	 (y-sublist ())
	 (dist 0)
	 (denom (length x)))
    (loop (setq x-sublist (fill-partition x))
	  (setq y-sublist (fill-partition y))
	  (when (< (length y) *markov-partition*) (return dist))
	  (setq dist (+ dist (/ (elementwise-distance x-sublist y-sublist) denom)))
	  (if (= (length y) (length x)) (pop x) t)
	  (pop y))))

(defun build-sequence (x &optional (result ()))
  (if (eq (length x) (1+ (* 2 *lag*))) result
    (build-sequence (cdr x) (push (car x) result))))
;;
;; Do the orbits of the middle two words of each sequence shadow each other?
;; Return a list containing the distances between x_{i+n} and x_{j+n} for 
;; n= -kappa,..., 0,..., kappa. This function has no knowledge of the value
;; of kappa, so the caller must form concordances of the right length.
(defun distance-sequence (x y &optional (result ()))
  (if (eq (length result) (- (length x) (1+ (* 2 *lag*))))
      (let* ((seq-x (build-sequence x))
	     (seq-y (build-sequence y))
	     (result (push (distance seq-x seq-y) result)))
	(distance-sequence (cdr x) (cdr y) result))))

;;
;; The keys are words, the values stored are contingency tables.
;; The table is stored in a list (n0f n0d n1f n1d) where the
;; first subscript refers to the word, the second subscript refers
;; to either 'fruit' or 'door'. So n0f = number of concordances which
;; have 'fruit' but not the word, etc. 
(defun make-wordlist (x)
  (let ((wordlist (list)))
    (dolist (y x wordlist)
      (dolist (token (third y) wordlist) (if (member token wordlist) t (push token wordlist))))))

(defun initialize-word-hash (wordlist)
  (let ((word-hash (make-hash-table)))
    (dolist (word wordlist word-hash)
      (setf (gethash word word-hash) (list 0 0 0 0)))))
	
(defun update-word-hash (concordance word word-hash)
  (if (eq (caar concordance) 'fruit) 
      (if (member word (third concordance))
	  (incf (third (gethash word word-hash)))
	(incf (first (gethash word word-hash))))
    (if (member word (third concordance))
	(incf (fourth (gethash word word-hash)))
      (incf (second (gethash word word-hash))))))

(defun fill-word-hash (x wordlist word-hash)
  (dolist (word wordlist word-hash)
    (dolist (concordance x word-hash)
      (update-word-hash concordance word word-hash))))


(defun make-word-hash (x)
  (let* ((words (make-wordlist x))
	 (word-hash (initialize-word-hash words)))
    (fill-word-hash x words word-hash)))


;;
;; Count the number of tokens that will be dropped when finding their
;; signifigances in the word hash.
(defun count-lost-tokens (hash x)
  (do* ((conc (pop x) (pop x))
	(lost-tokens 0 (+ lost-tokens (dolist (token (third conc) lost-tokens) (if (< (second (gethash token hash)) .05) (1+ lost-tokens) t))))
	(total-tokens 0 (+ total-tokens (- (length (third conc)) 1))))
       ((null x) lost-tokens)))

;;
;; Replace the extremely uncommon words. This will keep common words that depend on
;; context (which will result in matches within a sense but not without) and "context
;; independent" words as measured by Fisher's test, which might be used differently based
;; on distance to the center.
(defun make-word-frequencies (x)
  (let* ((wordlist (make-wordlist x))
	 (word-freqs (make-hash-table))
	 (word-freqs (dolist (word wordlist word-freqs) (setf (gethash word word-freqs) 0))))
    (dolist (record x word-freqs) 
      (dolist (word (third record) word-freqs)
	(incf (gethash word word-freqs))))))
;;
;; Replace the rare words in the hash with -1's.
(defun replace-hash (x hash cutoff)
  (let* ((wordlist (make-wordlist x))
	 (sortedwords (sort wordlist #'< :key #'(lambda (y) (gethash y hash))))
	 (n-occurrences (* (length x) (length (third (first x)))))
	 (p-val 0))
    (loop (when (> p-val cutoff) (return hash))
	  (setq next-word (pop sortedwords))
	  (setq p-val (+ p-val (/ (gethash next-word hash) n-occurrences)))
	  (setf (gethash next-word hash) -1))))
(defun replace-rare-words (x hash)
  (let ((new-x (list)))
    (dolist (word (third x) new-x) (cond ((eq word 11) (push word new-x))
					 ((> 0 (gethash word hash)) (push -1 new-x))
					 (t (push word new-x))))))

;; The extension for the output file _mpk.txt refers to a run with 'markov partition' = k.
(defun outpoop-dists (x hash)
;;   (let ((files (list '/mnt/biafra/jason/var/p07.txt '/mnt/biafra/jason/var/p05.txt  '/mnt/biafra/jason/var/p04.txt '/mnt/biafra/jason/var/p025.txt '/mnt/biafra/jason/var/p01.txt))
;; 	(p-vals (list .07 .05 .04 .025 .01)))
;;     (do ((file (pop files) (pop files))
;; 	 (p-val (pop p-vals) (pop p-vals))
;; 	 (x concs concs))
;; 	((null file) t)
  (with-open-file (strm "/mnt/biafra/jason/var/dists_lag5.txt" :direction :output)
		  (do ((conc1 (pop x) (pop x)))
		      ((null x) t)
		      (dolist (conc2 x) 
			(let* ((unif (random 1.0))
			       (should-print (cond ((and (eq (caar conc1) 'door) (eq (caar conc2) 'door)
							 (< unif .0025)) t)
						   ((and (eq (caar conc1) 'fruit) (eq (caar conc2) 'fruit)))
						   (t nil))))
			  (if should-print
			      (format strm "~& ~a ~a ~a ~a" 
;				      (caar conc1) (caar conc2) (caadr conc1) 
; 				      (caadr conc2) (distance (third conc2) (third conc1)))))))))
  				      (caar conc1) (caar conc2 ) 
				      (caadr conc2) (caadr conc2) 
				      (distance  (replace-rare-words conc2 hash) 
  							       (replace-rare-words conc1 hash)))
			    t))))))
(defun outpoop-unusual-sentences (x hash)
    (with-open-file (strm "/mnt/biafra/jason/var/wierd-sentences.txt" :direction :output)
		  (do ((conc1 (pop x) (pop x)))
		      ((null x) t)
		      (dolist (conc2 x) 
			(let* ((p-val .05)
			       (dis (distance (replace-context-words conc2 hash p-val)
								      (replace-context-words conc1 hash p-val)))
			       (should-print (cond ((and (eq (caar conc2) 'fruit) (eq (caar conc1) 'fruit) 
							 (< dis .5)) t)
						   ((and (eq (caar conc2) 'fruit) (eq (caar conc1) 'fruit)
							 (> dis 4.8)) t)
						   ((and (eq (caar conc2) 'door) (eq (caar conc1) 'door) 
							 (> dis 4.8)) t)
						   ((and (eq (caar conc2) 'door) (eq (caar conc1) 'door) 
							 (< dis .35)) t)
						   (t nil))))
			  (if should-print
			      (format strm "~& ~a ~a ~a ~a ~a ~a~%" (caar conc2) (caadr conc1) (caadr conc2) 
				      dis (third conc1) (third conc2))))))))
(defun outpoop-match-stats (x hash)
  (do ((conc1 (pop x) (pop x)))
      ((null x) t)
      (dolist (conc2 x) 
	(if (and (< (random 1.0) .025) (eq (caar conc2) 'door) (eq (caar conc1) 'door))
	    (count-match-dist (replace-rare-words conc2 hash)
			      (replace-rare-words conc1 hash))
	  t)))
  (format t "~& ~a ~a ~a ~a" *not-minus1-not-match* *not-minus1-match* *minus1-not-match* *minus1-match*))

(defun get-dist-sequence (x y)
  (let* ((x-sublist ())
	 (y-sublist ())
	 (dist-seq ())
	 (priordist 10))
    (loop (setq x-sublist (fill-partition x))
	  (setq y-sublist (fill-partition y))
	  (when (< (length y) *markov-partition*) (return dist-seq))
	  (setq distn (elementwise-distance x-sublist y-sublist))
	  (push  distn dist-seq)
	  (if (and (< priordist 1) (< 9 distn))
	      (print (list (list priordist priorx priory) (list distn x-sublist y-sublist))) t)
	  (setq priordist distn)
	  (setq priorx x-sublist)
	  (setq priory y-sublist)
	  (pop x)
	  (pop y))))
(defun update-matrix (sequence matrix)
  (cond ((< (length sequence) 2) matrix)
	(t (incf (aref matrix (cadr sequence) (car sequence)))
	   (update-matrix (cdr sequence) matrix))))

(defun initialize-transit-matrix ()
  (do ((n 0 (1+ n))
       (matrix (list) (push (do ((m 0 (1+ m))
				 (row (list) (push 0 row)))
				((= m (1+ (* 2 *markov-partition*))) row)) matrix)))
      ((= n (1+ (* 2 *markov-partition*))) matrix)))
(defun get-dist-transition-matrix (concs hash)
  (let ((transit-matrix (make-array (list (1+ (* 2 *markov-partition*)) (1+ (* 2 *markov-partition*)))
				    :initial-contents (initialize-transit-matrix))))
    (do ((conc1 (pop concs) (pop concs))
	 (n 0 (1+ n)))
	((null concs) transit-matrix)
	(if (and (eq (caar conc1) 'door) (< (random 1.0) .0025))
	    (dolist (conc2 concs) 
	      (if (eq (caar conc2) 'door)
		  (let* ((conc1-replaced (replace-rare-words conc1 hash))
			 (conc2-replaced (replace-rare-words conc2 hash))
			 (dist-seq (get-dist-sequence conc2-replaced conc1-replaced)))
		    (if (and (member 0 dist-seq) (member 10 dist-seq))
			(print (list conc2-replaced conc1-replaced dist-seq)) t)
		    (setq transit-matrix (update-matrix dist-seq transit-matrix)))
		t))
	  t))))
(load '/mnt/biafra/jason/var/concordances.lisp)
;(load '/home/jason/tmp/concor-dropped.lisp)
;(setf word-hash (make-hash-table))
;(load '/mnt/biafra/jason/var/word-hash.lisp)
;(load '/mnt/biafra/jason/var/wordhash_pvalues.lisp)
;(setf doors (list))
;(setf fruits (list))
;(dolist (x concordances) (if (equal (caar x) 'door) (push x doors)
;					    (push x fruits)))

;(outpoop-unusual-sentences concordances word-hash)
(setf word-hash (make-word-frequencies concordances))
(replace-hash concordances word-hash .21)
;(setf transition (get-dist-transition-matrix concordances word-hash))
;(print transition)
;; (setf nprints 0)
;; (loop (setq conc1 (pop concordances))
;;       (when (or (null concordances) (> nprints 2000)) (return t))
;;       (if (eq (caar conc1) 'door)
;; 	  (dolist (conc2 concordances) 
;; 	    (if (eq (caar conc2) 'door)
;; 		(if (<= (distance (replace-rare-words conc1 word-hash) (replace-rare-words conc2 word-hash)) 1.8)
;; 		    (and (print (list conc1 conc2)) (incf nprints)) t) t)) t))
;(setf n-m1 0)
;(setf wordlist (make-wordlist concordances))
;(dolist (word wordlist n-m1) (if (eq (gethash word word-hash) -1) 
;				  (incf n-m1) t))
;(print (length wordlist))

(outpoop-dists concordances word-hash)

;(outpoop-match-stats concordances word-hash)
;(setf word-hash (make-word-hash concordances))
;; (with-open-file (strm "/mnt/biafra/jason/var/word-hash.lisp" :direction :output)
;; 		(do* ((key 0 (1+ key))
;; 		      (val (gethash key word-hash) (gethash key word-hash)))
;; 		     ((null val) t)
;; 		     (format strm "~& (setf (gethash ~a word-hash) ~a" key val)))

;(do* ((realword (read) (read nil nil))
;      (corp (read) (read nil nil))
;      (concordance (read) (read nil nil))
;      (conc-dat () (push 
;     ((null corp) t))
;     (setf distances (distance lst lst2))
;     (format t "~& ~a ~a ~a ~a ~a" (first realword) (first realword2) (first corp) (first corp2) distances))
;; (with-open-file (strm "/mnt/biafra/jason/var/wordhash.txt" :direction :output)
;; 		(do ((key 0 (1+ key)))
;; 		    ((null (gethash key word-hash)) t)
;; 		    (print (list key (gethash key word-hash)) strm)))

;; Were the indepenedent words the most common words?

;; (multiple-value-bind (indep-words dep-words indep-tokens dep-tokens)
;; 		     (do* ((n 0 (1+ n))
;; 			   (word (gethash n word-hash) (gethash n word-hash))
;; 			   (indep-tokens 0 (if (null word) indep-tokens 
;; 					     (if (> (second word) .05) (+ indep-tokens (third (first word))
;; 									  (fourth (first word))) indep-tokens)))
;; 			   (indep-words 0 (if (null word) indep-words 
;; 					    (if (> (second word) .05) (1+ indep-words) indep-words)))
;; 			   (dep-tokens 0 (if (null word) dep-tokens 
;; 					   (if (< (second word) .05) (+ dep-tokens (third (first word)) 
;; 								      (fourth (first word))) dep-tokens)))
;; 			   (dep-words 0 (if (null word) dep-words
;; 					  (if (< (second word) .05) (1+ dep-words) dep-words))))
;; 			  ((null word) (values indep-words dep-words indep-tokens dep-tokens))))