Repository: thavelick/summarize Branch: master Commit: 428f55b79917 Files: 7 Total size: 5.7 KB Directory structure: gitextract_ut6_9l2r/ ├── .gitignore ├── LICENSE.TXT ├── README.md ├── run-tests.py ├── setup.py ├── summarize.py └── test/ └── summarize.doctest ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.py[co] # Packages *.egg *.egg-info dist build eggs parts bin var sdist develop-eggs .installed.cfg # Installer logs pip-log.txt # Unit test / coverage reports .coverage .tox #Translations *.mo #Mr Developer .mr.developer.cfg ================================================ FILE: LICENSE.TXT ================================================ Copyright (C) 2010-2012 Tristan Havelick Licensed under the Apache License, Version 2.0 (the 'License'); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ summarize ========= A python library for simple text summarization Installation ============ First install nltk and numpy: sudo pip install nltk sudo pip install numpy Then install the punkt and stopwords nltk packages: sudo python -m nltk.downloader -d /usr/share/nltk_data punkt sudo python -m nltk.downloader -d /usr/share/nltk_data stopwords Then, run the tests: python run-tests.py If nothing is output, you're good to go! 
"""Run every doctest file kept in the ``test`` directory beside this script.

Nothing is printed when all examples pass; doctest reports each failing
example on stdout and the process then exits with status 1.
"""
import doctest
from glob import glob
import os.path
import sys


def run_doctests(pattern=None):
    """Run the doctest files matching ``pattern`` and count the failures.

    pattern -- glob pattern selecting the doctest files.  When omitted,
               ``test/*.doctest`` next to this script is used, so the
               result does not depend on the current working directory.

    Returns the total number of failed examples over all matched files
    (0 when everything passed or when nothing matched).
    """
    if pattern is None:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        pattern = os.path.join(script_dir, 'test', '*.doctest')

    failures = 0
    for path in sorted(glob(pattern)):
        # module_relative=False makes doctest treat ``path`` as a plain
        # filesystem path, the same interpretation glob() used to find it.
        failed, _attempted = doctest.testfile(path, module_relative=False)
        failures += failed
    return failures


if __name__ == '__main__':
    # A non-zero exit status lets shells and CI notice failing doctests.
    sys.exit(1 if run_doctests() else 0)
""" ##////////////////////////////////////////////////////// ## Simple Summarizer ##////////////////////////////////////////////////////// from nltk.probability import FreqDist from nltk.tokenize import RegexpTokenizer from nltk.corpus import stopwords import nltk.data class SimpleSummarizer: def reorder_sentences( self, output_sentences, input ): output_sentences.sort( lambda s1, s2: input.find(s1) - input.find(s2) ) return output_sentences def get_summarized(self, input, num_sentences ): # TODO: allow the caller to specify the tokenizer they want # TODO: allow the user to specify the sentence tokenizer they want tokenizer = RegexpTokenizer('\w+') # get the frequency of each word in the input base_words = [word.lower() for word in tokenizer.tokenize(input)] words = [word for word in base_words if word not in stopwords.words()] word_frequencies = FreqDist(words) # now create a set of the most frequent words most_frequent_words = [pair[0] for pair in word_frequencies.items()[:100]] # break the input up into sentences. working_sentences is used # for the analysis, but actual_sentences is used in the results # so capitalization will be correct. sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') actual_sentences = sent_detector.tokenize(input) working_sentences = [sentence.lower() for sentence in actual_sentences] # iterate over the most frequent words, and add the first sentence # that inclues each word to the result. 
output_sentences = [] for word in most_frequent_words: for i in range(0, len(working_sentences)): if (word in working_sentences[i] and actual_sentences[i] not in output_sentences): output_sentences.append(actual_sentences[i]) break if len(output_sentences) >= num_sentences: break if len(output_sentences) >= num_sentences: break # sort the output sentences back to their original order return self.reorder_sentences(output_sentences, input) def summarize(self, input, num_sentences): return " ".join(self.get_summarized(input, num_sentences)) ================================================ FILE: test/summarize.doctest ================================================ .. Copyright (C) 2010-2012 Tristan Havelick. .. For license information, see LICENSE.TXT =========== Summarizers =========== Overview ~~~~~~~~ Summarizers provide a short summary or abstract from a longer document. >>> import summarize SimpleSummarizer ~~~~~~~~~~~~~~~~ A SimpleSummarizer makes a summary by using sentences with the most frequent words >>> ss = summarize.SimpleSummarizer() >>> input = "NLTK is a python library for working human-written text. Summarize is a package that uses NLTK to create summaries." >>> ss.summarize(input, 1) 'NLTK is a python library for working human-written text.' You can specify any number of sentenecs in the summary as you like. >>> input = "NLTK is a python library for working human-written text. Summarize is a package that uses NLTK to create summaries. A Summariser is really cool. I don't think there are any other python summarisers." >>> ss.summarize(input, 2) "NLTK is a python library for working human-written text. I don't think there are any other python summarisers." Unlike the original algorithm from Classifier4J, this summarizer works correctly with punctuation other than periods: >>> input = "NLTK is a python library for working human-written text! Summarize is a package that uses NLTK to create summaries." 
>>> ss.summarize(input, 1) 'NLTK is a python library for working human-written text!'