;;; -*- Package: USER; Mode: LISP; Syntax: Common-lisp -*- (in-package "USER") ;;; ***************** ;;; Reload this file. ;;; ***************** (defparameter *workfile* "~/lisp/huffman") (defmacro L (&optional (filename *workfile*)) `(load ,filename)) (defmacro CL (&optional (filename *workfile*)) `(progn (compile-file ,filename) (load ,filename))) ;;; ************** ;;; File-handling. ;;; ************** ;;; ** I don't like putting this section first, but it contains macros. ** ;;; We have to do a lot of fiddly file-handling ourselves, in order to ;;; get the right conventions for EOF. It's a nuisance to find out ;;; whether we're at EOF, because we have to peek at the next number before ;;; we read it, to see whether it's negative. ;; mystream structure: ;; ;; Similar to a Common Lisp input stream, but with these special features: ;; The data must be numbers. ;; EOF can (but need not) be marked by a negative number. ;; We can either read from a file or from a Lisp list, as we like. ;; After EOF, read operations will return 0. ;; Use mystream-eof? to test for end of file, and mystream-read to read a datum. ;; Gets the next datum from a file or list, or -1 if we ;; hit the physical end. (So if the end isn't signaled by a ;; negative number, for some reason, we still pretend that it is.) (defmacro read-underlying-stream (underlying-stream) `(if (listp ,underlying-stream) (if (null ,underlying-stream) -1 (pop ,underlying-stream)) (read ,underlying-stream nil -1))) ;; returns -1 if we hit physical EOF (defstruct mystream underlying-stream ;; either a real stream, or a list of data objects to be "read" (next-datum (read-underlying-stream underlying-stream)) ;; if eof? is false, this holds next datum to return (eof? (minusp next-datum))) ;; are we at eof? (Initialization value depends on init val of next-datum.) ;; Returns the next number from the mystream. If there are ;; no more, returns 0. (defun mystream-read (mystream) (if (mystream-eof? mystream) 0 (let* ((datum (mystream-next-datum mystream)) (next-datum (read-underlying-stream (mystream-underlying-stream mystream)))) (if (minusp next-datum) (setf (mystream-eof? mystream) t) (setf (mystream-next-datum mystream) next-datum)) datum))) ;; Macro similar to with-open-file. The variable streamname is bound ;; to a freshly created mystream during execution of the body. (defmacro with-open-mystream (streamname filename-or-list &rest body) `(if (listp ,filename-or-list) (progn (let ((,streamname (make-mystream :underlying-stream ,filename-or-list))) ,@body)) (with-open-file (underlying-stream ,filename-or-list :direction :input) (let ((,streamname (make-mystream :underlying-stream underlying-stream))) ,@body)))) ;;; ************* ;;; Dictionaries. ;;; ************* ;; A node in the Huffman code tree. It dominates some set of ;; leaves, corresponding to a set S of source words. We think of this ;; tree node as a code dictionary for S. ;; ;; What happens in the Huffman algorithm is that we've built code ;; dictionaries for two disjoint sets of words, S0 and S1. We ;; merge those dictionaries into a dictionary for S0 union S1, ;; by prefixing 0 to the codes for S0 and 1 to the codes for S1. ;; ;; All dictionary nodes are stored in a codearray, which has just ;; enough elements to hold all the nodes: 2(m^n) - 1, where m^n is the ;; number of source words. (defstruct dict word ; the source word represented by this dictionary, if it's a leaf. ; We store this as a single number. (count 0) ; number of times a word from S appeared in the text (bitcount 0) ; This field is here for convenience -- it's a waste of storage. ; bitcount/count gives the average codelength for this dictionary. 0-child ; dictionary for the subset of S whose codes start with 0 (or nil) 1-child ; dictionary for the subset of S whose codes start with 1 (or nil) parent ; unique dictionary having this one as a child ) ;;; *************************** ;;; Reading segmented messages. ;;; *************************** ;; Reads an n-digit, base-m number from the mystream. ;; (We interpret this as a segment of length n, ;; whose elements are taken from the source alphabet ;; 0, 1, ... m-1.) ;; ;; Returns NIL if we're at end-of-file. ;; ;; Note: If we're not at end-of-file, but don't have fully ;; n digits on the mystream, we pretend the other digits ;; are 0. (So we should store the original filesize ;; with the compressed version -- that lets us trim ;; the blanks upon uncompression.) (defun read-word (mystream m n) (unless (mystream-eof? mystream) (loop with sum = 0 repeat n for digit = (mystream-read mystream) do (setf sum (* sum m)) (incf sum digit) finally (return sum)))) ;; Reads words from a mystream, and updates the counts in the code ;; tree. Returns the total number of words read. ;; ;; m is the size of the source alphabet; n is the number of ;; letters in each source word (our segmentation length). (defun count-words (mystream codearray m &optional (n 1)) (loop for word = (read-word mystream m n) while word do (incf (dict-count (aref codearray word))) count T)) ;; Given a number representing a source word, decodes it ;; into a list of its component letters. (defun unsegment-word (word m n) (loop with answer with letter repeat n do (multiple-value-setq (word letter) (floor word m)) (push letter answer) finally (return answer))) ;; Just like count-words, except that it doesn't get the ;; data from a stream. It PRETENDS to have read the data from ;; a source with the same letter probabilities as provided ;; in unsegmented-codearray, and with Bernoulli words of length n. ;; If the original source was a sequence of k single-letter messages, the ;; notional Bernoulli source is a sequence of k^n n-letter messages. (defun count-Bernoulli-words (unsegmented-codearray codearray m n) (loop for word from 0 to (1- (expt m n)) for letters = (unsegment-word word m n) for letter-counts = (loop for letter in letters collect (dict-count (aref unsegmented-codearray letter))) for word-count = (apply #'* letter-counts) do (setf (dict-count (aref codearray word)) word-count) sum word-count))) ;;; ************************** ;;; Building the Huffman code. ;;; ************************** ;; Creates an array to hold the code tree. m^n is the number of ;; possible source words. These will become the leaves of a binary ;; tree, with 2*(m^n) - 1 nodes total. ;; ;; We place an empty dictionary at each of the first m^n nodes (leaves), ;; representing a degenerate code that translates the single sourceword ;; for that node into a codeword of 0 length. These dictionaries ;; start out with 0 counts; count-words will fill them in. ;; ;; The codearray keeps track of its first "unused" element with a ;; so-called fill pointer. (This is a Common Lisp feature.) ;; Only the first m^n nodes are used at first. When new nodes are added ;; to the code tree, we will put them at the "end" of the array -- ;; i.e., wherever the fill pointer has advanced to. (defun make-codearray (m n) (loop with m^n = (expt m n) with array-size = (1- (* 2 m^n)) with codearray = (make-array array-size :fill-pointer m^n) for i from 0 to (length codearray) ;; respects fill pointer do (setf (aref codearray i) (make-dict :word i)) finally (return codearray))) ;; Finds the two lowest-probability nodes. ;; Returns them in a two-element list (the larger first). ;; ;; WARNING: WE USE SIMPLE LINEAR SEARCH; IT WOULD BE MUCH ;; FASTER TO USE A BINARY HEAP. ;; ;; Note: We assume that the text has length less ;; than a googol. This assumption could be avoided in ;; at least two ways, but it's pretty safe. (Common ;; Lisp has a great talent for representing bignums, but ;; has no representation for infinity.) (defun smallest-two (codearray) (loop with mincount1 = (expt 10 100) ; smallest count seen with mincount2 = (expt 10 100) ; second smallest count seen with dict1 ; dictionary whose count is mincount1 with dict2 ; dictionary whose count is mincount2 for i from 0 to (1- (length codearray)) ; respects fill pointer for dict = (aref codearray i) ; every dictionary in the "used" part of codearray ;; If this dictionary hasn't already been merged, ;; and has a particularly small count, when (and (null (dict-parent dict)) (< (dict-count dict) mincount2)) ;; then let it displace mincount2, and swap ;; mincount1 and mincount2 if necessary. ;; (Could be slightly more efficient.) do (setf mincount2 (dict-count dict) dict2 dict) (when (< mincount2 mincount1) (psetf mincount1 mincount2 ;; parallel assignment -- swaps 'em mincount2 mincount1 dict1 dict2 dict2 dict1)) finally (return (list dict2 dict1)))) ;; Merges two old nodes into a new one. ;; Modifies the codearray, and returns nothing. (defun merge-dicts (dict0 dict1 codearray) (let* ((count (+ (dict-count dict0) (dict-count dict1))) (bitcount (+ count ; each instance now has to bear an extra bit (0 or 1) (dict-bitcount dict0) (dict-bitcount dict1))) (new-dict (make-dict :count count :bitcount bitcount :0-child dict0 :1-child dict1))) (setf (dict-parent dict0) new-dict (dict-parent dict1) new-dict) (vector-push new-dict codearray) (values))) ; return nothing ;; This is the main routine. It creates a codearray ;; ready to hold m^n source words (of length n, from ;; the alphabet 0,1,...m-1). Then it counts all the words ;; from mystream. Finally, it merges the least probable ;; dictionaries, and repeats until it has built a complete ;; binary tree of size 2*(m^n) - 1. (defun build-code (mystream m n) (loop with codearray = (make-codearray m n) with final-treesize = (array-dimension codearray 0) with total-words = (count-words mystream codearray m n) initially (format t "~&(Finished reading ~A messages.)" total-words) while (< (length codearray) final-treesize) for (dict0 dict1) = (smallest-two codearray) do (merge-dicts dict0 dict1 codearray) finally (return codearray))) ;; Builds a code, taking the letters of the source text from the file ;; or list specified. m is the size of the source alphabet; n is the ;; desired segment length. (defun build-code-from-file (filename-or-list m n) (with-open-mystream mystream filename-or-list (build-code mystream m n))) ;; Just like build-code, but it counts the words by ;; calling count-Bernoulli-words on unsegmented-codearray. ;; See count-Bernoulli-words for details. (defun build-Bernoulli-code (unsegmented-codearray m n) (loop with codearray = (make-codearray m n) with final-treesize = (array-dimension codearray 0) with total-words = (count-Bernoulli-words unsegmented-codearray codearray m n) initially (format t "~&(Finished pretending to read ~A messages.)" total-words) while (< (length codearray) final-treesize) for (dict0 dict1) = (smallest-two codearray) do (merge-dicts dict0 dict1 codearray) finally (return codearray))) ;;; ******************************************** ;;; Applying the Huffman code to arbitrary text. ;;; ******************************************** ;; Maps source words to codewords. ;; ;; To look up a codeword, give this routine ;; the leaf dictionary corresponding to the source word. ;; The codeword is returned as a list of bits. (defun encode (dict &optional finalbits &aux (parent (dict-parent dict))) (cond ((null parent) finalbits) ((eq dict (dict-0-child parent)) (encode parent (cons 0 finalbits))) (t (encode parent (cons 1 finalbits))))) ;; Given a list of words, each one represented as a ;; single integer, encodes them into a long list ;; of bits. (defun encode-words (codearray words) (loop for word in words nconc (encode (aref codearray word)))) ;; Reads letters from the file or list specified, ;; and encodes them with the code provided. (defun encode-letters (codearray filename-or-list m n) (with-open-mystream mystream filename-or-list (loop for word = (read-word mystream m n) while word nconc (encode (aref codearray word))))) ;; Maps codewords into sourcewords. ;; ;; Given a bit string (as a list) and a dictionary, decodes ;; the first codeword on the bit string. ;; ;; Returns a list whose first element is the source word, and ;; whose second element is what's left of the bit string. (defun decode (dict bitstring) (cond ((dict-word dict) ;; if this is a leaf (list (dict-word dict) bitstring)) ((= 0 (first bitstring)) (decode (dict-0-child dict) (rest bitstring))) ((= 1 (first bitstring)) (decode (dict-1-child dict) (rest bitstring))))) ;; Given a list of bits, decodes into a list of words. (defun decode-into-words (codearray bitstring) (loop with main-dict = (aref codearray (1- (length codearray))) ;; the root dictionary was added last initially (setf bits bitstring) while bits for (word bits) = (decode main-dict bits) collect word)) ;; Given a list of bits, decodes into a list of words. (defun decode-into-letters (codearray bitstring m n) (loop for word in (decode-into-words codearray bitstring) nconc (unsegment-word word m n))) ;;; ********************* ;;; Printing our results. ;;; ********************* ;; Describes a code. If the :brief? keyword argument ;; has a non-nil value, then prints only summary stats ;; (not the entire dictionary). (defun describe-code (codearray &key (brief? nil)) (let* ((num-words (* 1/2 (1+ (length codearray)))) (main-dict (aref codearray (1- (length codearray)))) (total-count (dict-count main-dict)) (total-bitcount (dict-bitcount main-dict)) (codeword-length (float (/ total-bitcount total-count))) (entropy 0)) (loop for i from 0 to (1- num-words) for dict = (aref codearray i) for prob = (float (/ (dict-count dict) total-count)) for code = (encode dict) unless brief? do (format t "~&Word ~4D Sample prob ~,5F ~A" i prob code) unless (zerop prob) do (incf entropy (* -1 prob (log prob 2)))) (unless brief? (format t "~&~%Sorted by probability:~&~%") (loop with sorted-words = (sort (subseq codearray 0 num-words) #'> :key #'dict-count) for i from 0 to (1- num-words) for dict = (aref sorted-words i) for word = (dict-word dict) for prob = (float (/ (dict-count dict) total-count)) for code = (encode dict) do (format t "~&Message ~4D Sample prob ~,5F ~A" word prob code))) (format t "~&~%Sample text consisted of ~A messages." total-count) (format t "~&Text would be encoded into ~A bits (average codeword length ~,5F)." total-bitcount codeword-length) (format t "~&Source entropy: ~,5F." entropy) (values))) ; return nothing ;; Given a Huffman codearray, finds the average codeword length for a ;; Shannon-Fano code. ;; ;; (Doesn't construct the Shannon-Fano code -- just looks at the probabilities ;; and imagines a code for the same message source.) (defun Shannon-Fano (codearray) (let* ((num-words (* 1/2 (1+ (length codearray)))) (main-dict (aref codearray (1- (length codearray)))) (total-count (dict-count main-dict))) (loop for i from 0 to (1- num-words) for dict = (aref codearray i) for prob = (float (/ (dict-count dict) total-count)) for codeword-length = (ceiling (- (log prob 2))) sum (* prob codeword-length)))) ;;; ************************************************ ;;; Convert between ABC... and 123..., just for fun. ;;; ************************************************ (defun stringify (numlist) (coerce (loop with alphabet = " abcdefghijklmnopqrstuvwxyz" for num in numlist collect (char alphabet num)) 'string)) (defun numerify (string) (loop with alphabet = " abcdefghijklmnopqrstuvwxyz" for i from 0 to (1- (length string)) for char = (char string i) collect (position char alphabet)))