Repository: codebox/markov-text Branch: master Commit: f753fa6e19d6 Files: 14 Total size: 20.8 KB Directory structure: gitextract_evctftok/ ├── .gitignore ├── LICENSE ├── README.md ├── db.py ├── gen.py ├── markov.py ├── parse.py ├── rnd.py ├── sql.py └── test/ ├── db_test.py ├── gen_test.py ├── parse_test.py ├── sql_test.py └── suite.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pyc *.db *.txt .DS_Store ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2015 Rob Dawson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # markov-text This is a Python implementation of a Markov Text Generator. A [Markov Text Generator](http://en.wikipedia.org/wiki/Markov_chain) can be used to randomly generate (somewhat) realistic sentences, using words from a source text. Words are joined together in sequence, with each new word being selected based on how often it follows the previous word in the source document. The results are often just nonsense, but at times can be strangely poetic - the sentences below were generated from the text of The Hitchhikers Guide to the Galaxy: > Bits of perpetual unchangingness. > So long waves of matter, strained, twisted sharply. > So they are going to undulate across him and the species. > The barman reeled for every particle of Gold streaked through her eye. > We've met, haven't they? Look, said Ford never have good time you are merely a receipt. > The silence was delighted. ### Parsing

To use the utility, first find a source document (the larger the better) and save it as a UTF-8 encoded text file. Executing the utility in 'parse' mode, as shown, will create a .db file containing information about how frequently words follow other words in the text file.

python markov.py parse <name> <depth> <file>

* The `name` argument can be any non-empty value - this is just the name you have chosen for the source document * The `depth` argument is a numeric value (minimum 2) which determines how many of the previous words are used to select the next word. Normally a depth of 2 is used, meaning that each word is selected based only on the previous one. The larger the depth value, the more similar the generated sentences will be to those appearing in the source text. Beyond a certain depth the generated sentences will be identical to those appearing in the source. * The `file` argument indicates the location of the source text file For example:

python markov.py parse hitchhikers_guide 2 /path/to/hitchhikers.txt

The parsing process may take a while to complete, depending on the size of the input document.

### Generating

To generate new sentences, run the utility in 'generate' mode, using the name specified during the parse operation

python markov.py gen <name> <count>

* The `name` argument should match the name used with the earlier `parse` command * The `count` argument is a numeric value indicating how many sentences to generate For example:

>python markov.py gen hitchhikers_guide 3
Look, I can't speak Vogon! You don't need to touch the water
He frowned, then smiled, then tried to gauge the speed at which they were able to pick up hitch hikers
The hatchway sealed itself tight, and all the streets around it

================================================ FILE: db.py ================================================ class Db: DEPTH_PARAM_NAME = 'depth' def __init__(self, conn, sql): self.conn = conn self.cursor = conn.cursor() self.sql = sql self.depth = None def setup(self, depth): self.depth = depth self.cursor.execute(self.sql.create_word_table_sql(depth)) self.cursor.execute(self.sql.create_index_sql(depth)) self.cursor.execute(self.sql.create_param_table_sql()) self.cursor.execute(self.sql.set_param_sql(), (self.DEPTH_PARAM_NAME, depth)) def _get_word_list_count(self, word_list): if len(word_list) != self.get_depth(): raise ValueError('Expected %s words in list but found %s' % (self.get_depth(), len(word_list))) self.cursor.execute(self.sql.select_count_for_words_sql(self.get_depth()), word_list) r = self.cursor.fetchone() if r: return r[0] else: return 0 def get_depth(self): if self.depth == None: self.cursor.execute(self.sql.get_param_sql(), (self.DEPTH_PARAM_NAME,)) r = self.cursor.fetchone() if r: self.depth = int(r[0]) else: raise ValueError('No depth value found in database, db does not seem to have been created by this utility') return self.depth def add_word(self, word_list): count = self._get_word_list_count(word_list) if count: self.cursor.execute(self.sql.update_count_for_words_sql(self.get_depth()), [count + 1] + word_list) else: self.cursor.execute(self.sql.insert_row_for_words_sql(self.get_depth()), word_list + [1]) def commit(self): self.conn.commit() def get_word_count(self, word_list): counts = {} sql = self.sql.select_words_and_counts_sql(self.get_depth()) for row in self.cursor.execute(sql, word_list): counts[row[0]] = row[1] return counts ================================================ FILE: gen.py ================================================ from parse import Parser class Generator: def __init__(self, name, db, rnd): self.name = name self.db = db self.rnd = rnd def _get_next_word(self, word_list): candidate_words = self.db.get_word_count(word_list) total_next_words = sum(candidate_words.values()) i = self.rnd.randint(total_next_words) t=0 for w in candidate_words.keys(): t += candidate_words[w] if (i <= t): return w assert False def generate(self, word_separator): depth = self.db.get_depth() sentence = [Parser.SENTENCE_START_SYMBOL] * (depth - 1) end_symbol = [Parser.SENTENCE_END_SYMBOL] * (depth - 1) while True: tail = sentence[(-depth+1):] if tail == end_symbol: break word = self._get_next_word(tail) sentence.append(word) return word_separator.join(sentence[depth-1:][:1-depth]) ================================================ FILE: markov.py ================================================ from db import Db from gen import Generator from parse import Parser from sql import Sql from rnd import Rnd import sys import sqlite3 import codecs SENTENCE_SEPARATOR = '.' WORD_SEPARATOR = ' ' if __name__ == '__main__': args = sys.argv usage = 'Usage: %s (parse |gen )' % (args[0], ) if (len(args) < 3): raise ValueError(usage) mode = args[1] name = args[2] if mode == 'parse': if (len(args) != 5): raise ValueError(usage) depth = int(args[3]) file_name = args[4] db = Db(sqlite3.connect(name + '.db'), Sql()) db.setup(depth) txt = codecs.open(file_name, 'r', 'utf-8').read() Parser(name, db, SENTENCE_SEPARATOR, WORD_SEPARATOR).parse(txt) elif mode == 'gen': count = int(args[3]) db = Db(sqlite3.connect(name + '.db'), Sql()) generator = Generator(name, db, Rnd()) for i in range(0, count): print(generator.generate(WORD_SEPARATOR)) else: raise ValueError(usage) ================================================ FILE: parse.py ================================================ import sys import re class Parser: SENTENCE_START_SYMBOL = '^' SENTENCE_END_SYMBOL = '$' def __init__(self, name, db, sentence_split_char = '\n', word_split_char = ''): self.name = name self.db = db self.sentence_split_char = sentence_split_char self.word_split_char = word_split_char self.whitespace_regex = re.compile('\s+') def parse(self, txt): depth = self.db.get_depth() sentences = txt.split(self.sentence_split_char) i = 0 for sentence in sentences: sentence = self.whitespace_regex.sub(" ", sentence).strip() list_of_words = None if self.word_split_char: list_of_words = sentence.split(self.word_split_char) else: list_of_words = list(sentence.lower()) words = [Parser.SENTENCE_START_SYMBOL] * (depth - 1) + list_of_words + [Parser.SENTENCE_END_SYMBOL] * (depth - 1) for n in range(0, len(words) - depth + 1): self.db.add_word(words[n:n+depth]) i += 1 if i % 1000 == 0: print(i) sys.stdout.flush() self.db.commit() ================================================ FILE: rnd.py ================================================ from random import randint class Rnd: def randint(self, maxint): return randint(1, maxint) ================================================ FILE: sql.py ================================================ class Sql: WORD_COL_NAME_PREFIX = 'word' COUNT_COL_NAME = 'count' WORD_TABLE_NAME = 'word' INDEX_NAME = 'i_word' PARAM_TABLE_NAME = 'param' KEY_COL_NAME = 'name' VAL_COL_NAME = 'value' def _check_column_count(self, count): if count < 2: raise ValueError('Invalid column_count value, must be >= 2') def _make_column_name_list(self, column_count): return ', '.join([self.WORD_COL_NAME_PREFIX + str(n) for n in range(1, column_count + 1)]) def _make_column_names_and_placeholders(self, column_count): return ' AND '.join(['%s%s=?' % (self.WORD_COL_NAME_PREFIX, n) for n in range(1, column_count + 1)]) def create_word_table_sql(self, column_count): return 'CREATE TABLE IF NOT EXISTS %s (%s, %s)' % (self.WORD_TABLE_NAME, self._make_column_name_list(column_count), self.COUNT_COL_NAME) def create_param_table_sql(self): return 'CREATE TABLE IF NOT EXISTS %s (%s, %s)' % (self.PARAM_TABLE_NAME, self.KEY_COL_NAME, self.VAL_COL_NAME) def set_param_sql(self): return 'INSERT INTO %s (%s, %s) VALUES (?, ?)' % (self.PARAM_TABLE_NAME, self.KEY_COL_NAME, self.VAL_COL_NAME) def get_param_sql(self): return 'SELECT %s FROM %s WHERE %s=?' % (self.VAL_COL_NAME, self.PARAM_TABLE_NAME, self.KEY_COL_NAME) def create_index_sql(self, column_count): return 'CREATE INDEX IF NOT EXISTS %s ON %s (%s)' % (self.INDEX_NAME, self.WORD_TABLE_NAME, self._make_column_name_list(column_count)) def select_count_for_words_sql(self, column_count): return 'SELECT %s FROM %s WHERE %s' % (self.COUNT_COL_NAME, self.WORD_TABLE_NAME, self._make_column_names_and_placeholders(column_count)) def update_count_for_words_sql(self, column_count): return 'UPDATE %s SET %s=? WHERE %s' % (self.WORD_TABLE_NAME, self.COUNT_COL_NAME, self._make_column_names_and_placeholders(column_count)) def insert_row_for_words_sql(self, column_count): columns = self._make_column_name_list(column_count) + ', ' + self.COUNT_COL_NAME values = ', '.join(['?'] * (column_count + 1)) return 'INSERT INTO %s (%s) VALUES (%s)' % (self.WORD_TABLE_NAME, columns, values) def select_words_and_counts_sql(self, column_count): last_word_col_name = self.WORD_COL_NAME_PREFIX + str(column_count) return 'SELECT %s, %s FROM %s WHERE %s' % (last_word_col_name, self.COUNT_COL_NAME, self.WORD_TABLE_NAME, self._make_column_names_and_placeholders(column_count - 1)) def delete_words_sql(self): return 'DELETE FROM ' + self.WORD_TABLE_NAME ================================================ FILE: test/db_test.py ================================================ import unittest from db import Db class DbTest(unittest.TestCase): def setUp(self): self.conn = StubConn() self.sql = StubSql() def test_correct_sql_run_when_setup_called(self): Db(self.conn, self.sql).setup(3) execute_args = self.conn.stub_cursor.execute_args self.assertEqual(len(execute_args), 4) self.assertEqual(execute_args[0], ('create_word_table_sql 3',)) self.assertEqual(execute_args[1], ('create_index_sql 3',)) self.assertEqual(execute_args[2], ('create_param_table_sql',)) self.assertEqual(execute_args[3], ('set_param_sql', ('depth', 3))) def test_error_when_add_word_count_wrong(self): db = Db(self.conn, self.sql) db.setup(3) self.assertRaises(ValueError, db.add_word, ['one','two']) def test_insert_row_when_add_new_word_list(self): db = Db(self.conn, self.sql) db.setup(3) word_list = ['one', 'two', 'three'] db.add_word(word_list) execute_args = self.conn.stub_cursor.execute_args self.assertEqual(len(execute_args), 6) self.assertEqual(execute_args[4], ('select_count_for_words_sql 3', word_list)) self.assertEqual(execute_args[5], ('insert_row_for_words_sql 3', word_list + [1])) def test_update_row_when_add_repeated_word_list(self): db = Db(self.conn, self.sql) db.setup(3) row_count = 10 word_list = ['one', 'two', 'three'] self.conn.stub_cursor.fetchone_results.append([row_count]) db.add_word(word_list) execute_args = self.conn.stub_cursor.execute_args self.assertEqual(len(execute_args), 6) self.assertEqual(execute_args[4], ('select_count_for_words_sql 3', word_list)) self.assertEqual(execute_args[5], ('update_count_for_words_sql 3', [row_count + 1] + word_list)) def test_db_commit_performed_correctly(self): db = Db(self.conn, self.sql) db.setup(3) self.assertEqual(self.conn.commit_count, 0) db.commit() self.assertEqual(self.conn.commit_count, 1) def test_get_word_counts_works_correctly(self): db = Db(self.conn, self.sql) db.setup(3) word_list = ['i', 'like'] self.conn.stub_cursor.execute_results = [[['dogs', 1], ['cats', 2], ['frogs', 3]]] word_counts = db.get_word_count(word_list) self.assertEqual(word_counts, {'dogs' : 1, 'cats' : 2, 'frogs' : 3}) execute_args = self.conn.stub_cursor.execute_args self.assertEqual(len(execute_args), 5) self.assertEqual(execute_args[4], ('select_words_and_counts_sql 3', word_list)) class StubCursor: def __init__(self): self.execute_results = [] self.execute_args = [] self.fetchone_results = [] self.fetchone_count = 0 def fetchone(self): self.fetchone_count += 1 if len(self.fetchone_results): return self.fetchone_results.pop(0) return None def execute(self, *args): self.execute_args.append(args) if len(self.execute_results): return self.execute_results.pop(0) return None def get_execute_count(self): return self.execute_count class StubConn: def __init__(self): self.commit_count = 0 self.stub_cursor = StubCursor() def commit(self): self.commit_count += 1 def cursor(self): return self.stub_cursor class StubSql: def create_word_table_sql(self, column_count): return 'create_word_table_sql' + ' ' + str(column_count) def create_index_sql(self, column_count): return 'create_index_sql' + ' ' + str(column_count) def create_param_table_sql(self): return 'create_param_table_sql' def set_param_sql(self): return 'set_param_sql' def get_param_sql(self): return 'get_param_sql' def select_count_for_words_sql(self, column_count): return 'select_count_for_words_sql' + ' ' + str(column_count) def update_count_for_words_sql(self, column_count): return 'update_count_for_words_sql' + ' ' + str(column_count) def insert_row_for_words_sql(self, column_count): return 'insert_row_for_words_sql' + ' ' + str(column_count) def select_words_and_counts_sql(self, column_count): return 'select_words_and_counts_sql' + ' ' + str(column_count) def delete_words_sql(self): return 'delete_words_sql' if __name__ == '__main__': unittest.main() ================================================ FILE: test/gen_test.py ================================================ import unittest from collections import OrderedDict from gen import Generator class GenTest(unittest.TestCase): def setUp(self): self.db = StubDb() self.rnd = StubRnd() def test_generated_sequence_is_correct(self): self.db.count_values = [ OrderedDict([('the', 2), ('a', 1)]), OrderedDict([('mat', 1), ('cat', 1)]), OrderedDict([('sat', 2)]), OrderedDict([('on', 1), ('under' , 4)]), OrderedDict([('my', 2), ('the', 2)]), OrderedDict([('mat', 1), ('cat', 1)]), OrderedDict([('$', 1)])] self.rnd.vals = [1, 2, 2, 1, 4, 1, 1] self.assertEqual(Generator('name', self.db, self.rnd).generate(' '), 'the cat sat on the mat') self.assertEqual(self.db.get_word_count_args, [['^'], ['the'], ['cat'], ['sat'], ['on'], ['the'], ['mat']]) self.assertEqual(self.rnd.maxints, [3, 2, 2, 5, 4, 2, 1]) class StubDb: def __init__(self): self.count_values = [] self.get_word_count_args = [] self.depth = 2 def get_depth(self): return self.depth def get_word_count(self, word_list): self.get_word_count_args.append(word_list) return self.count_values.pop(0) class StubRnd: def __init__(self): self.vals = [] self.maxints = [] def randint(self, maxint): self.maxints.append(maxint) return self.vals.pop(0) if __name__ == '__main__': unittest.main() ================================================ FILE: test/parse_test.py ================================================ import unittest from parse import Parser class ParserTest(unittest.TestCase): def setUp(self): self.db = StubDb() def test_db_updated_correctly_from_input_with_depth_2_and_extra_whitespace(self): Parser('name', self.db, '\n', ' ').parse(' the cat sat on the mat \n good cat ') self.assertEqual(self.db.commit_count, 1) self.assertEqual(self.db.added_word_list, [['^', 'the'], ['the', 'cat'], ['cat', 'sat'], ['sat', 'on'], ['on', 'the'], ['the', 'mat'], ['mat', '$'], ['^', 'good'], ['good', 'cat'], ['cat', '$']]) def test_db_updated_correctly_from_input_with_depth_4(self): self.db.depth = 4 Parser('name', self.db, '\n', ' ').parse('the cat sat on the mat') self.assertEqual(self.db.commit_count, 1) self.assertEqual(self.db.added_word_list, [['^', '^', '^', 'the'], ['^', '^', 'the', 'cat'], ['^', 'the', 'cat', 'sat'], ['the', 'cat', 'sat', 'on'], ['cat', 'sat', 'on', 'the'], ['sat', 'on', 'the', 'mat'], ['on', 'the', 'mat', '$'], ['the', 'mat', '$', '$'], ['mat', '$', '$', '$']]) class StubDb: def __init__(self): self.commit_count = 0 self.added_word_list = [] self.depth = 2 def get_depth(self): return self.depth def add_word(self, word_list): self.added_word_list.append(word_list) def commit(self): self.commit_count += 1 if __name__ == '__main__': unittest.main() ================================================ FILE: test/sql_test.py ================================================ import unittest from sql import Sql class SqlTest(unittest.TestCase): def test_create_word_table_sql_correct(self): self.assertEqual(Sql().create_word_table_sql(3), 'CREATE TABLE IF NOT EXISTS word (word1, word2, word3, count)') def test_create_param_table_sql_correct(self): self.assertEqual(Sql().create_param_table_sql(), 'CREATE TABLE IF NOT EXISTS param (name, value)') def test_set_param_sql_correct(self): self.assertEqual(Sql().set_param_sql(), 'INSERT INTO param (name, value) VALUES (?, ?)') def test_create_index_sql_correct(self): self.assertEqual(Sql().create_index_sql(3), 'CREATE INDEX IF NOT EXISTS i_word ON word (word1, word2, word3)') def test_select_count_for_words_sql_correct(self): self.assertEqual(Sql().select_count_for_words_sql(3), 'SELECT count FROM word WHERE word1=? AND word2=? AND word3=?') def test_update_count_for_words_sql_correct(self): self.assertEqual(Sql().update_count_for_words_sql(3), 'UPDATE word SET count=? WHERE word1=? AND word2=? AND word3=?') def test_insert_row_for_words_sql_correct(self): self.assertEqual(Sql().insert_row_for_words_sql(3), 'INSERT INTO word (word1, word2, word3, count) VALUES (?, ?, ?, ?)') def test_select_words_and_counts_sql_correct(self): self.assertEqual(Sql().select_words_and_counts_sql(3), 'SELECT word3, count FROM word WHERE word1=? AND word2=?') def test_delete_words_sql_correct(self): self.assertEqual(Sql().delete_words_sql(), 'DELETE FROM word') if __name__ == '__main__': unittest.main() ================================================ FILE: test/suite.py ================================================ import unittest from db_test import DbTest from gen_test import GenTest from parse_test import ParserTest from sql_test import SqlTest def suite(): test_suite = unittest.TestSuite() test_suite.addTest(unittest.makeSuite(DbTest)) test_suite.addTest(unittest.makeSuite(GenTest)) test_suite.addTest(unittest.makeSuite(ParserTest)) test_suite.addTest(unittest.makeSuite(SqlTest)) return test_suite if __name__ == "__main__": #So you can run tests from this module individually. unittest.main()