Showing preview only (983K chars total). Download the full file or copy to clipboard to get everything.
Repository: zhangxiangxiao/glyph
Branch: master
Commit: df6ef3262156
Files: 833
Total size: 789.9 KB
Directory structure:
gitextract_d_gx1n5i/
├── LICENSE
├── README.md
├── data/
│ ├── 11st/
│ │ ├── construct_rr.py
│ │ ├── create_post.py
│ │ ├── create_review.py
│ │ ├── segment_rr_word.lua
│ │ └── segment_word.py
│ ├── README.md
│ ├── chinanews/
│ │ └── construct_topic.py
│ ├── data/
│ │ └── README.txt
│ ├── dianping/
│ │ ├── combine_gram_count.lua
│ │ ├── construct_charbag.lua
│ │ ├── construct_chargram.lua
│ │ ├── construct_chartoken.lua
│ │ ├── construct_code.lua
│ │ ├── construct_pinyin.py
│ │ ├── construct_reviews.lua
│ │ ├── construct_string.lua
│ │ ├── construct_tfidf.lua
│ │ ├── construct_word.lua
│ │ ├── construct_wordbag.lua
│ │ ├── construct_wordgram.lua
│ │ ├── construct_wordtoken.lua
│ │ ├── convert_string_code.lua
│ │ ├── count_chargram.lua
│ │ ├── count_wordgram.lua
│ │ ├── limit_code.lua
│ │ ├── limit_csvlines.sh
│ │ ├── queue.lua
│ │ ├── remove_duplication.py
│ │ ├── remove_null.sh
│ │ ├── segment_roman_word.lua
│ │ ├── segment_word.py
│ │ ├── select_data.lua
│ │ ├── shuffle_lines.sh
│ │ ├── sort_gram_count.sh
│ │ ├── sort_gram_list.sh
│ │ ├── split_lines.sh
│ │ └── split_train.lua
│ ├── ifeng/
│ │ └── construct_topic.py
│ ├── jd/
│ │ ├── count_data.lua
│ │ ├── create_comment.py
│ │ ├── limit_length.lua
│ │ └── sort_data.sh
│ ├── joint/
│ │ ├── combine_word.lua
│ │ └── combine_word_list.lua
│ ├── nytimes/
│ │ ├── construct_topic.py
│ │ └── count_class.lua
│ └── rakuten/
│ ├── construct_hepburn.py
│ ├── create_review.py
│ └── segment_word.py
├── doc/
│ └── dianping.md
├── embednet/
│ ├── archive/
│ │ ├── 11stbinary_temporal12length512feature256.sh
│ │ ├── 11stbinary_temporal12length512feature256byte.sh
│ │ ├── 11stbinary_temporal12length512feature256roman.sh
│ │ ├── 11stbinary_temporal12length512feature256romanword.sh
│ │ ├── 11stbinary_temporal12length512feature256word.sh
│ │ ├── 11stbinary_temporal8length486feature256.sh
│ │ ├── 11stbinary_temporal8length486feature256byte.sh
│ │ ├── 11stbinary_temporal8length486feature256roman.sh
│ │ ├── 11stbinary_temporal8length486feature256romanword.sh
│ │ ├── 11stbinary_temporal8length486feature256word.sh
│ │ ├── 11stfull_temporal12length512feature256.sh
│ │ ├── 11stfull_temporal12length512feature256byte.sh
│ │ ├── 11stfull_temporal12length512feature256roman.sh
│ │ ├── 11stfull_temporal12length512feature256romanword.sh
│ │ ├── 11stfull_temporal12length512feature256word.sh
│ │ ├── 11stfull_temporal8length486feature256.sh
│ │ ├── 11stfull_temporal8length486feature256byte.sh
│ │ ├── 11stfull_temporal8length486feature256roman.sh
│ │ ├── 11stfull_temporal8length486feature256romanword.sh
│ │ ├── 11stfull_temporal8length486feature256word.sh
│ │ ├── amazonbinary_temporal12length512feature256.sh
│ │ ├── amazonbinary_temporal12length512feature256word.sh
│ │ ├── amazonbinary_temporal8length486feature256.sh
│ │ ├── amazonbinary_temporal8length486feature256word.sh
│ │ ├── amazonfull_temporal12length512feature256.sh
│ │ ├── amazonfull_temporal12length512feature256word.sh
│ │ ├── amazonfull_temporal8length486feature256.sh
│ │ ├── amazonfull_temporal8length486feature256word.sh
│ │ ├── chinanews_temporal12length512feature256.sh
│ │ ├── chinanews_temporal12length512feature256byte.sh
│ │ ├── chinanews_temporal12length512feature256roman.sh
│ │ ├── chinanews_temporal12length512feature256romanword.sh
│ │ ├── chinanews_temporal12length512feature256word.sh
│ │ ├── chinanews_temporal8length486feature256.sh
│ │ ├── chinanews_temporal8length486feature256byte.sh
│ │ ├── chinanews_temporal8length486feature256roman.sh
│ │ ├── chinanews_temporal8length486feature256romanword.sh
│ │ ├── chinanews_temporal8length486feature256word.sh
│ │ ├── dianping_temporal12length512feature256.sh
│ │ ├── dianping_temporal12length512feature256byte.sh
│ │ ├── dianping_temporal12length512feature256roman.sh
│ │ ├── dianping_temporal12length512feature256romanword.sh
│ │ ├── dianping_temporal12length512feature256word.sh
│ │ ├── dianping_temporal8length486feature256.sh
│ │ ├── dianping_temporal8length486feature256byte.sh
│ │ ├── dianping_temporal8length486feature256roman.sh
│ │ ├── dianping_temporal8length486feature256romanword.sh
│ │ ├── dianping_temporal8length486feature256word.sh
│ │ ├── ifeng_temporal12length512feature256.sh
│ │ ├── ifeng_temporal12length512feature256byte.sh
│ │ ├── ifeng_temporal12length512feature256roman.sh
│ │ ├── ifeng_temporal12length512feature256romanword.sh
│ │ ├── ifeng_temporal12length512feature256word.sh
│ │ ├── ifeng_temporal8length486feature256.sh
│ │ ├── ifeng_temporal8length486feature256byte.sh
│ │ ├── ifeng_temporal8length486feature256roman.sh
│ │ ├── ifeng_temporal8length486feature256romanword.sh
│ │ ├── ifeng_temporal8length486feature256word.sh
│ │ ├── jdbinary_temporal12length512feature256.sh
│ │ ├── jdbinary_temporal12length512feature256byte.sh
│ │ ├── jdbinary_temporal12length512feature256roman.sh
│ │ ├── jdbinary_temporal12length512feature256romanword.sh
│ │ ├── jdbinary_temporal12length512feature256word.sh
│ │ ├── jdbinary_temporal8length486feature256.sh
│ │ ├── jdbinary_temporal8length486feature256byte.sh
│ │ ├── jdbinary_temporal8length486feature256roman.sh
│ │ ├── jdbinary_temporal8length486feature256romanword.sh
│ │ ├── jdbinary_temporal8length486feature256word.sh
│ │ ├── jdfull_temporal12length512feature256.sh
│ │ ├── jdfull_temporal12length512feature256byte.sh
│ │ ├── jdfull_temporal12length512feature256roman.sh
│ │ ├── jdfull_temporal12length512feature256romanword.sh
│ │ ├── jdfull_temporal12length512feature256word.sh
│ │ ├── jdfull_temporal8length486feature256.sh
│ │ ├── jdfull_temporal8length486feature256byte.sh
│ │ ├── jdfull_temporal8length486feature256roman.sh
│ │ ├── jdfull_temporal8length486feature256romanword.sh
│ │ ├── jdfull_temporal8length486feature256word.sh
│ │ ├── jointbinary_temporal12length512feature256.sh
│ │ ├── jointbinary_temporal12length512feature256byte.sh
│ │ ├── jointbinary_temporal12length512feature256roman.sh
│ │ ├── jointbinary_temporal12length512feature256romanword.sh
│ │ ├── jointbinary_temporal12length512feature256word.sh
│ │ ├── jointbinary_temporal8length486feature256.sh
│ │ ├── jointbinary_temporal8length486feature256byte.sh
│ │ ├── jointbinary_temporal8length486feature256roman.sh
│ │ ├── jointbinary_temporal8length486feature256romanword.sh
│ │ ├── jointbinary_temporal8length486feature256word.sh
│ │ ├── jointfull_temporal12length512feature256.sh
│ │ ├── jointfull_temporal12length512feature256byte.sh
│ │ ├── jointfull_temporal12length512feature256roman.sh
│ │ ├── jointfull_temporal12length512feature256romanword.sh
│ │ ├── jointfull_temporal12length512feature256word.sh
│ │ ├── jointfull_temporal8length486feature256.sh
│ │ ├── jointfull_temporal8length486feature256byte.sh
│ │ ├── jointfull_temporal8length486feature256roman.sh
│ │ ├── jointfull_temporal8length486feature256romanword.sh
│ │ ├── jointfull_temporal8length486feature256word.sh
│ │ ├── nytimes_temporal12length512feature256.sh
│ │ ├── nytimes_temporal12length512feature256word.sh
│ │ ├── nytimes_temporal8length486feature256.sh
│ │ ├── nytimes_temporal8length486feature256word.sh
│ │ ├── rakutenbinary_temporal12length512feature256.sh
│ │ ├── rakutenbinary_temporal12length512feature256byte.sh
│ │ ├── rakutenbinary_temporal12length512feature256roman.sh
│ │ ├── rakutenbinary_temporal12length512feature256romanword.sh
│ │ ├── rakutenbinary_temporal12length512feature256word.sh
│ │ ├── rakutenbinary_temporal8length486feature256.sh
│ │ ├── rakutenbinary_temporal8length486feature256byte.sh
│ │ ├── rakutenbinary_temporal8length486feature256roman.sh
│ │ ├── rakutenbinary_temporal8length486feature256romanword.sh
│ │ ├── rakutenbinary_temporal8length486feature256word.sh
│ │ ├── rakutenfull_temporal12length512feature256.sh
│ │ ├── rakutenfull_temporal12length512feature256byte.sh
│ │ ├── rakutenfull_temporal12length512feature256roman.sh
│ │ ├── rakutenfull_temporal12length512feature256romanword.sh
│ │ ├── rakutenfull_temporal12length512feature256word.sh
│ │ ├── rakutenfull_temporal8length486feature256.sh
│ │ ├── rakutenfull_temporal8length486feature256byte.sh
│ │ ├── rakutenfull_temporal8length486feature256roman.sh
│ │ ├── rakutenfull_temporal8length486feature256romanword.sh
│ │ └── rakutenfull_temporal8length486feature256word.sh
│ ├── config.lua
│ ├── data.lua
│ ├── driver.lua
│ ├── model.lua
│ ├── unittest/
│ │ ├── data.lua
│ │ ├── driver.lua
│ │ ├── model.lua
│ │ ├── model_cudnn.lua
│ │ ├── model_cunn.lua
│ │ ├── test.lua
│ │ ├── test_cuda.lua
│ │ ├── train.lua
│ │ └── train_cuda.lua
│ └── visualizer.lua
├── fasttext/
│ └── archive/
│ ├── 11stbinary_charbigram.sh
│ ├── 11stbinary_charbigram_evaluation.sh
│ ├── 11stbinary_charbigram_tuned.sh
│ ├── 11stbinary_charpentagram.sh
│ ├── 11stbinary_charpentagram_evaluation.sh
│ ├── 11stbinary_charpentagram_tuned.sh
│ ├── 11stbinary_charunigram.sh
│ ├── 11stbinary_charunigram_evaluation.sh
│ ├── 11stbinary_charunigram_tuned.sh
│ ├── 11stbinary_wordbigram.sh
│ ├── 11stbinary_wordbigram_evaluation.sh
│ ├── 11stbinary_wordbigram_tuned.sh
│ ├── 11stbinary_wordbigramroman.sh
│ ├── 11stbinary_wordbigramroman_evaluation.sh
│ ├── 11stbinary_wordbigramroman_tuned.sh
│ ├── 11stbinary_wordpentagram.sh
│ ├── 11stbinary_wordpentagram_evaluation.sh
│ ├── 11stbinary_wordpentagram_tuned.sh
│ ├── 11stbinary_wordpentagramroman.sh
│ ├── 11stbinary_wordpentagramroman_evaluation.sh
│ ├── 11stbinary_wordpentagramroman_tuned.sh
│ ├── 11stbinary_wordunigram.sh
│ ├── 11stbinary_wordunigram_evaluation.sh
│ ├── 11stbinary_wordunigram_tuned.sh
│ ├── 11stbinary_wordunigramroman.sh
│ ├── 11stbinary_wordunigramroman_evaluation.sh
│ ├── 11stbinary_wordunigramroman_tuned.sh
│ ├── 11stfull_charbigram.sh
│ ├── 11stfull_charbigram_evaluation.sh
│ ├── 11stfull_charbigram_tuned.sh
│ ├── 11stfull_charpentagram.sh
│ ├── 11stfull_charpentagram_evaluation.sh
│ ├── 11stfull_charpentagram_tuned.sh
│ ├── 11stfull_charunigram.sh
│ ├── 11stfull_charunigram_evaluation.sh
│ ├── 11stfull_charunigram_tuned.sh
│ ├── 11stfull_wordbigram.sh
│ ├── 11stfull_wordbigram_evaluation.sh
│ ├── 11stfull_wordbigram_tuned.sh
│ ├── 11stfull_wordbigramroman.sh
│ ├── 11stfull_wordbigramroman_evaluation.sh
│ ├── 11stfull_wordbigramroman_tuned.sh
│ ├── 11stfull_wordpentagram.sh
│ ├── 11stfull_wordpentagram_evaluation.sh
│ ├── 11stfull_wordpentagram_tuned.sh
│ ├── 11stfull_wordpentagramroman.sh
│ ├── 11stfull_wordpentagramroman_evaluation.sh
│ ├── 11stfull_wordpentagramroman_tuned.sh
│ ├── 11stfull_wordunigram.sh
│ ├── 11stfull_wordunigram_evaluation.sh
│ ├── 11stfull_wordunigram_tuned.sh
│ ├── 11stfull_wordunigramroman.sh
│ ├── 11stfull_wordunigramroman_evaluation.sh
│ ├── 11stfull_wordunigramroman_tuned.sh
│ ├── amazonbinary_charbigram.sh
│ ├── amazonbinary_charbigram_evaluation.sh
│ ├── amazonbinary_charbigram_tuned.sh
│ ├── amazonbinary_charpentagram.sh
│ ├── amazonbinary_charpentagram_evaluation.sh
│ ├── amazonbinary_charpentagram_tuned.sh
│ ├── amazonbinary_charunigram.sh
│ ├── amazonbinary_charunigram_evaluation.sh
│ ├── amazonbinary_charunigram_tuned.sh
│ ├── amazonbinary_wordbigram.sh
│ ├── amazonbinary_wordbigram_evaluation.sh
│ ├── amazonbinary_wordbigram_tuned.sh
│ ├── amazonbinary_wordpentagram.sh
│ ├── amazonbinary_wordpentagram_evaluation.sh
│ ├── amazonbinary_wordpentagram_tuned.sh
│ ├── amazonbinary_wordunigram.sh
│ ├── amazonbinary_wordunigram_evaluation.sh
│ ├── amazonbinary_wordunigram_tuned.sh
│ ├── amazonfull_charbigram.sh
│ ├── amazonfull_charbigram_evaluation.sh
│ ├── amazonfull_charbigram_tuned.sh
│ ├── amazonfull_charpentagram.sh
│ ├── amazonfull_charpentagram_evaluation.sh
│ ├── amazonfull_charpentagram_tuned.sh
│ ├── amazonfull_charunigram.sh
│ ├── amazonfull_charunigram_evaluation.sh
│ ├── amazonfull_charunigram_tuned.sh
│ ├── amazonfull_wordbigram.sh
│ ├── amazonfull_wordbigram_evaluation.sh
│ ├── amazonfull_wordbigram_tuned.sh
│ ├── amazonfull_wordpentagram.sh
│ ├── amazonfull_wordpentagram_evaluation.sh
│ ├── amazonfull_wordpentagram_tuned.sh
│ ├── amazonfull_wordunigram.sh
│ ├── amazonfull_wordunigram_evaluation.sh
│ ├── amazonfull_wordunigram_tuned.sh
│ ├── chinanews_charbigram.sh
│ ├── chinanews_charbigram_evaluation.sh
│ ├── chinanews_charbigram_tuned.sh
│ ├── chinanews_charpentagram.sh
│ ├── chinanews_charpentagram_evaluation.sh
│ ├── chinanews_charpentagram_tuned.sh
│ ├── chinanews_charunigram.sh
│ ├── chinanews_charunigram_evaluation.sh
│ ├── chinanews_charunigram_tuned.sh
│ ├── chinanews_wordbigram.sh
│ ├── chinanews_wordbigram_evaluation.sh
│ ├── chinanews_wordbigram_tuned.sh
│ ├── chinanews_wordbigramroman.sh
│ ├── chinanews_wordbigramroman_evaluation.sh
│ ├── chinanews_wordbigramroman_tuned.sh
│ ├── chinanews_wordpentagram.sh
│ ├── chinanews_wordpentagram_evaluation.sh
│ ├── chinanews_wordpentagram_tuned.sh
│ ├── chinanews_wordpentagramroman.sh
│ ├── chinanews_wordpentagramroman_evaluation.sh
│ ├── chinanews_wordpentagramroman_tuned.sh
│ ├── chinanews_wordunigram.sh
│ ├── chinanews_wordunigram_evaluation.sh
│ ├── chinanews_wordunigram_tuned.sh
│ ├── chinanews_wordunigramroman.sh
│ ├── chinanews_wordunigramroman_evaluation.sh
│ ├── chinanews_wordunigramroman_tuned.sh
│ ├── dianping_charbigram.sh
│ ├── dianping_charbigram_evaluation.sh
│ ├── dianping_charbigram_tuned.sh
│ ├── dianping_charpentagram.sh
│ ├── dianping_charpentagram_evaluation.sh
│ ├── dianping_charpentagram_tuned.sh
│ ├── dianping_charunigram.sh
│ ├── dianping_charunigram_evaluation.sh
│ ├── dianping_charunigram_tuned.sh
│ ├── dianping_wordbigram.sh
│ ├── dianping_wordbigram_evaluation.sh
│ ├── dianping_wordbigram_tuned.sh
│ ├── dianping_wordbigramroman.sh
│ ├── dianping_wordbigramroman_evaluation.sh
│ ├── dianping_wordbigramroman_tuned.sh
│ ├── dianping_wordpentagram.sh
│ ├── dianping_wordpentagram_evaluation.sh
│ ├── dianping_wordpentagram_tuned.sh
│ ├── dianping_wordpentagramroman.sh
│ ├── dianping_wordpentagramroman_evaluation.sh
│ ├── dianping_wordpentagramroman_tuned.sh
│ ├── dianping_wordunigram.sh
│ ├── dianping_wordunigram_evaluation.sh
│ ├── dianping_wordunigram_tuned.sh
│ ├── dianping_wordunigramroman.sh
│ ├── dianping_wordunigramroman_evaluation.sh
│ ├── dianping_wordunigramroman_tuned.sh
│ ├── ifeng_charbigram.sh
│ ├── ifeng_charbigram_evaluation.sh
│ ├── ifeng_charbigram_tuned.sh
│ ├── ifeng_charpentagram.sh
│ ├── ifeng_charpentagram_evaluation.sh
│ ├── ifeng_charpentagram_tuned.sh
│ ├── ifeng_charunigram.sh
│ ├── ifeng_charunigram_evaluation.sh
│ ├── ifeng_charunigram_tuned.sh
│ ├── ifeng_wordbigram.sh
│ ├── ifeng_wordbigram_evaluation.sh
│ ├── ifeng_wordbigram_tuned.sh
│ ├── ifeng_wordbigramroman.sh
│ ├── ifeng_wordbigramroman_evaluation.sh
│ ├── ifeng_wordbigramroman_tuned.sh
│ ├── ifeng_wordpentagram.sh
│ ├── ifeng_wordpentagram_evaluation.sh
│ ├── ifeng_wordpentagram_tuned.sh
│ ├── ifeng_wordpentagramroman.sh
│ ├── ifeng_wordpentagramroman_evaluation.sh
│ ├── ifeng_wordpentagramroman_tuned.sh
│ ├── ifeng_wordunigram.sh
│ ├── ifeng_wordunigram_evaluation.sh
│ ├── ifeng_wordunigram_tuned.sh
│ ├── ifeng_wordunigramroman.sh
│ ├── ifeng_wordunigramroman_evaluation.sh
│ ├── ifeng_wordunigramroman_tuned.sh
│ ├── jdbinary_charbigram.sh
│ ├── jdbinary_charbigram_evaluation.sh
│ ├── jdbinary_charbigram_tuned.sh
│ ├── jdbinary_charpentagram.sh
│ ├── jdbinary_charpentagram_evaluation.sh
│ ├── jdbinary_charpentagram_tuned.sh
│ ├── jdbinary_charunigram.sh
│ ├── jdbinary_charunigram_evaluation.sh
│ ├── jdbinary_charunigram_tuned.sh
│ ├── jdbinary_wordbigram.sh
│ ├── jdbinary_wordbigram_evaluation.sh
│ ├── jdbinary_wordbigram_tuned.sh
│ ├── jdbinary_wordbigramroman.sh
│ ├── jdbinary_wordbigramroman_evaluation.sh
│ ├── jdbinary_wordbigramroman_tuned.sh
│ ├── jdbinary_wordpentagram.sh
│ ├── jdbinary_wordpentagram_evaluation.sh
│ ├── jdbinary_wordpentagram_tuned.sh
│ ├── jdbinary_wordpentagramroman.sh
│ ├── jdbinary_wordpentagramroman_evaluation.sh
│ ├── jdbinary_wordpentagramroman_tuned.sh
│ ├── jdbinary_wordunigram.sh
│ ├── jdbinary_wordunigram_evaluation.sh
│ ├── jdbinary_wordunigram_tuned.sh
│ ├── jdbinary_wordunigramroman.sh
│ ├── jdbinary_wordunigramroman_evaluation.sh
│ ├── jdbinary_wordunigramroman_tuned.sh
│ ├── jdfull_charbigram.sh
│ ├── jdfull_charbigram_evaluation.sh
│ ├── jdfull_charbigram_tuned.sh
│ ├── jdfull_charpentagram.sh
│ ├── jdfull_charpentagram_evaluation.sh
│ ├── jdfull_charpentagram_tuned.sh
│ ├── jdfull_charunigram.sh
│ ├── jdfull_charunigram_evaluation.sh
│ ├── jdfull_charunigram_tuned.sh
│ ├── jdfull_wordbigram.sh
│ ├── jdfull_wordbigram_evaluation.sh
│ ├── jdfull_wordbigram_tuned.sh
│ ├── jdfull_wordbigramroman.sh
│ ├── jdfull_wordbigramroman_evaluation.sh
│ ├── jdfull_wordbigramroman_tuned.sh
│ ├── jdfull_wordpentagram.sh
│ ├── jdfull_wordpentagram_evaluation.sh
│ ├── jdfull_wordpentagram_tuned.sh
│ ├── jdfull_wordpentagramroman.sh
│ ├── jdfull_wordpentagramroman_evaluation.sh
│ ├── jdfull_wordpentagramroman_tuned.sh
│ ├── jdfull_wordunigram.sh
│ ├── jdfull_wordunigram_evaluation.sh
│ ├── jdfull_wordunigram_tuned.sh
│ ├── jdfull_wordunigramroman.sh
│ ├── jdfull_wordunigramroman_evaluation.sh
│ ├── jdfull_wordunigramroman_tuned.sh
│ ├── jointbinary_charbigram.sh
│ ├── jointbinary_charbigram_evaluation.sh
│ ├── jointbinary_charbigram_tuned.sh
│ ├── jointbinary_charpentagram.sh
│ ├── jointbinary_charpentagram_evaluation.sh
│ ├── jointbinary_charpentagram_tuned.sh
│ ├── jointbinary_charunigram.sh
│ ├── jointbinary_charunigram_evaluation.sh
│ ├── jointbinary_charunigram_tuned.sh
│ ├── jointbinary_wordbigram.sh
│ ├── jointbinary_wordbigram_evaluation.sh
│ ├── jointbinary_wordbigram_tuned.sh
│ ├── jointbinary_wordbigramroman.sh
│ ├── jointbinary_wordbigramroman_evaluation.sh
│ ├── jointbinary_wordbigramroman_tuned.sh
│ ├── jointbinary_wordpentagram.sh
│ ├── jointbinary_wordpentagram_evaluation.sh
│ ├── jointbinary_wordpentagram_tuned.sh
│ ├── jointbinary_wordpentagramroman.sh
│ ├── jointbinary_wordpentagramroman_evaluation.sh
│ ├── jointbinary_wordpentagramroman_tuned.sh
│ ├── jointbinary_wordunigram.sh
│ ├── jointbinary_wordunigram_evaluation.sh
│ ├── jointbinary_wordunigram_tuned.sh
│ ├── jointbinary_wordunigramroman.sh
│ ├── jointbinary_wordunigramroman_evaluation.sh
│ ├── jointbinary_wordunigramroman_tuned.sh
│ ├── jointfull_charbigram.sh
│ ├── jointfull_charbigram_evaluation.sh
│ ├── jointfull_charbigram_tuned.sh
│ ├── jointfull_charpentagram.sh
│ ├── jointfull_charpentagram_evaluation.sh
│ ├── jointfull_charpentagram_tuned.sh
│ ├── jointfull_charunigram.sh
│ ├── jointfull_charunigram_evaluation.sh
│ ├── jointfull_charunigram_tuned.sh
│ ├── jointfull_wordbigram.sh
│ ├── jointfull_wordbigram_evaluation.sh
│ ├── jointfull_wordbigram_tuned.sh
│ ├── jointfull_wordbigramroman.sh
│ ├── jointfull_wordbigramroman_evaluation.sh
│ ├── jointfull_wordbigramroman_tuned.sh
│ ├── jointfull_wordpentagram.sh
│ ├── jointfull_wordpentagram_evaluation.sh
│ ├── jointfull_wordpentagram_tuned.sh
│ ├── jointfull_wordpentagramroman.sh
│ ├── jointfull_wordpentagramroman_evaluation.sh
│ ├── jointfull_wordpentagramroman_tuned.sh
│ ├── jointfull_wordunigram.sh
│ ├── jointfull_wordunigram_evaluation.sh
│ ├── jointfull_wordunigram_tuned.sh
│ ├── jointfull_wordunigramroman.sh
│ ├── jointfull_wordunigramroman_evaluation.sh
│ ├── jointfull_wordunigramroman_tuned.sh
│ ├── nytimes_charbigram.sh
│ ├── nytimes_charbigram_evaluation.sh
│ ├── nytimes_charbigram_tuned.sh
│ ├── nytimes_charpentagram.sh
│ ├── nytimes_charpentagram_evaluation.sh
│ ├── nytimes_charpentagram_tuned.sh
│ ├── nytimes_charunigram.sh
│ ├── nytimes_charunigram_evaluation.sh
│ ├── nytimes_charunigram_tuned.sh
│ ├── nytimes_wordbigram.sh
│ ├── nytimes_wordbigram_evaluation.sh
│ ├── nytimes_wordbigram_tuned.sh
│ ├── nytimes_wordpentagram.sh
│ ├── nytimes_wordpentagram_evaluation.sh
│ ├── nytimes_wordpentagram_tuned.sh
│ ├── nytimes_wordunigram.sh
│ ├── nytimes_wordunigram_evaluation.sh
│ ├── nytimes_wordunigram_tuned.sh
│ ├── rakutenbinary_charbigram.sh
│ ├── rakutenbinary_charbigram_evaluation.sh
│ ├── rakutenbinary_charbigram_tuned.sh
│ ├── rakutenbinary_charpentagram.sh
│ ├── rakutenbinary_charpentagram_evaluation.sh
│ ├── rakutenbinary_charpentagram_tuned.sh
│ ├── rakutenbinary_charunigram.sh
│ ├── rakutenbinary_charunigram_evaluation.sh
│ ├── rakutenbinary_charunigram_tuned.sh
│ ├── rakutenbinary_wordbigram.sh
│ ├── rakutenbinary_wordbigram_evaluation.sh
│ ├── rakutenbinary_wordbigram_tuned.sh
│ ├── rakutenbinary_wordbigramroman.sh
│ ├── rakutenbinary_wordbigramroman_evaluation.sh
│ ├── rakutenbinary_wordbigramroman_tuned.sh
│ ├── rakutenbinary_wordpentagram.sh
│ ├── rakutenbinary_wordpentagram_evaluation.sh
│ ├── rakutenbinary_wordpentagram_tuned.sh
│ ├── rakutenbinary_wordpentagramroman.sh
│ ├── rakutenbinary_wordpentagramroman_evaluation.sh
│ ├── rakutenbinary_wordpentagramroman_tuned.sh
│ ├── rakutenbinary_wordunigram.sh
│ ├── rakutenbinary_wordunigram_evaluation.sh
│ ├── rakutenbinary_wordunigram_tuned.sh
│ ├── rakutenbinary_wordunigramroman.sh
│ ├── rakutenbinary_wordunigramroman_evaluation.sh
│ ├── rakutenbinary_wordunigramroman_tuned.sh
│ ├── rakutenfull_charbigram.sh
│ ├── rakutenfull_charbigram_evaluation.sh
│ ├── rakutenfull_charbigram_tuned.sh
│ ├── rakutenfull_charpentagram.sh
│ ├── rakutenfull_charpentagram_evaluation.sh
│ ├── rakutenfull_charpentagram_tuned.sh
│ ├── rakutenfull_charunigram.sh
│ ├── rakutenfull_charunigram_evaluation.sh
│ ├── rakutenfull_charunigram_tuned.sh
│ ├── rakutenfull_wordbigram.sh
│ ├── rakutenfull_wordbigram_evaluation.sh
│ ├── rakutenfull_wordbigram_tuned.sh
│ ├── rakutenfull_wordbigramroman.sh
│ ├── rakutenfull_wordbigramroman_evaluation.sh
│ ├── rakutenfull_wordbigramroman_tuned.sh
│ ├── rakutenfull_wordpentagram.sh
│ ├── rakutenfull_wordpentagram_evaluation.sh
│ ├── rakutenfull_wordpentagram_tuned.sh
│ ├── rakutenfull_wordpentagramroman.sh
│ ├── rakutenfull_wordpentagramroman_evaluation.sh
│ ├── rakutenfull_wordpentagramroman_tuned.sh
│ ├── rakutenfull_wordunigram.sh
│ ├── rakutenfull_wordunigram_evaluation.sh
│ ├── rakutenfull_wordunigram_tuned.sh
│ ├── rakutenfull_wordunigramroman.sh
│ ├── rakutenfull_wordunigramroman_evaluation.sh
│ └── rakutenfull_wordunigramroman_tuned.sh
├── glyphnet/
│ ├── archive/
│ │ ├── 11stbinary_spatial6temporal8length486feature256.sh
│ │ ├── 11stbinary_spatial8temporal12length512feature256.sh
│ │ ├── 11stfull_spatial6temporal8length486feature256.sh
│ │ ├── 11stfull_spatial8temporal12length512feature256.sh
│ │ ├── amazonbinary_spatial6temporal8length486feature256.sh
│ │ ├── amazonbinary_spatial8temporal12length512feature256.sh
│ │ ├── amazonfull_spatial6temporal8length486feature256.sh
│ │ ├── amazonfull_spatial8temporal12length512feature256.sh
│ │ ├── chinanews_spatial6temporal8length486feature256.sh
│ │ ├── chinanews_spatial8temporal12length512feature256.sh
│ │ ├── dianping_spatial6temporal8length486feature256.sh
│ │ ├── dianping_spatial8temporal12length512feature256.sh
│ │ ├── ifeng_spatial6temporal8length486feature256.sh
│ │ ├── ifeng_spatial8temporal12length512feature256.sh
│ │ ├── jdbinary_spatial6temporal8length486feature256.sh
│ │ ├── jdbinary_spatial8temporal12length512feature256.sh
│ │ ├── jdfull_spatial6temporal8length486feature256.sh
│ │ ├── jdfull_spatial8temporal12length512feature256.sh
│ │ ├── jointbinary_spatial6temporal8length486feature256.sh
│ │ ├── jointbinary_spatial8temporal12length512feature256.sh
│ │ ├── jointfull_spatial6temporal8length486feature256.sh
│ │ ├── jointfull_spatial8temporal12length512feature256.sh
│ │ ├── nytimes_spatial6temporal8length486feature256.sh
│ │ ├── nytimes_spatial8temporal12length512feature256.sh
│ │ ├── rakutenbinary_spatial6temporal8length486feature256.sh
│ │ ├── rakutenbinary_spatial8temporal12length512feature256.sh
│ │ ├── rakutenfull_spatial6temporal8length486feature256.sh
│ │ └── rakutenfull_spatial8temporal12length512feature256.sh
│ ├── config.lua
│ ├── data.lua
│ ├── driver.lua
│ ├── main.lua
│ ├── model.lua
│ ├── modules/
│ │ ├── TemporalConvolutionCudnn.lua
│ │ ├── TemporalConvolutionMM.lua
│ │ ├── TemporalMaxPoolingCudnn.lua
│ │ └── TemporalMaxPoolingMM.lua
│ ├── modules.lua
│ ├── scroll.lua
│ ├── scroll.ui
│ ├── test.lua
│ ├── train.lua
│ ├── unittest/
│ │ ├── data.lua
│ │ ├── driver.lua
│ │ ├── model.lua
│ │ ├── model_cuda.lua
│ │ ├── model_cudnn.lua
│ │ ├── modules_temporal.lua
│ │ ├── modules_temporal_cudnn.lua
│ │ ├── test.lua
│ │ ├── test_cuda.lua
│ │ ├── train.lua
│ │ └── train_cuda.lua
│ └── visualizer.lua
├── linearnet/
│ ├── archive/
│ │ ├── 11stbinary_charbag.sh
│ │ ├── 11stbinary_charbagtfidf.sh
│ │ ├── 11stbinary_chargram.sh
│ │ ├── 11stbinary_chargramtfidf.sh
│ │ ├── 11stbinary_wordbag.sh
│ │ ├── 11stbinary_wordbagroman.sh
│ │ ├── 11stbinary_wordbagtfidf.sh
│ │ ├── 11stbinary_wordbagtfidfroman.sh
│ │ ├── 11stbinary_wordgram.sh
│ │ ├── 11stbinary_wordgramroman.sh
│ │ ├── 11stbinary_wordgramtfidf.sh
│ │ ├── 11stbinary_wordgramtfidfroman.sh
│ │ ├── 11stfull_charbag.sh
│ │ ├── 11stfull_charbagtfidf.sh
│ │ ├── 11stfull_chargram.sh
│ │ ├── 11stfull_chargramtfidf.sh
│ │ ├── 11stfull_wordbag.sh
│ │ ├── 11stfull_wordbagroman.sh
│ │ ├── 11stfull_wordbagtfidf.sh
│ │ ├── 11stfull_wordbagtfidfroman.sh
│ │ ├── 11stfull_wordgram.sh
│ │ ├── 11stfull_wordgramroman.sh
│ │ ├── 11stfull_wordgramtfidf.sh
│ │ ├── 11stfull_wordgramtfidfroman.sh
│ │ ├── amazonbinary_charbag.sh
│ │ ├── amazonbinary_charbagtfidf.sh
│ │ ├── amazonbinary_chargram.sh
│ │ ├── amazonbinary_chargramtfidf.sh
│ │ ├── amazonbinary_wordbag.sh
│ │ ├── amazonbinary_wordbagtfidf.sh
│ │ ├── amazonbinary_wordgram.sh
│ │ ├── amazonbinary_wordgramtfidf.sh
│ │ ├── amazonfull_charbag.sh
│ │ ├── amazonfull_charbagtfidf.sh
│ │ ├── amazonfull_chargram.sh
│ │ ├── amazonfull_chargramtfidf.sh
│ │ ├── amazonfull_wordbag.sh
│ │ ├── amazonfull_wordbagtfidf.sh
│ │ ├── amazonfull_wordgram.sh
│ │ ├── amazonfull_wordgramtfidf.sh
│ │ ├── chinanews_charbag.sh
│ │ ├── chinanews_charbagtfidf.sh
│ │ ├── chinanews_chargram.sh
│ │ ├── chinanews_chargramtfidf.sh
│ │ ├── chinanews_wordbag.sh
│ │ ├── chinanews_wordbagroman.sh
│ │ ├── chinanews_wordbagtfidf.sh
│ │ ├── chinanews_wordbagtfidfroman.sh
│ │ ├── chinanews_wordgram.sh
│ │ ├── chinanews_wordgramroman.sh
│ │ ├── chinanews_wordgramtfidf.sh
│ │ ├── chinanews_wordgramtfidfroman.sh
│ │ ├── dianping_charbag.sh
│ │ ├── dianping_charbagtfidf.sh
│ │ ├── dianping_chargram.sh
│ │ ├── dianping_chargramtfidf.sh
│ │ ├── dianping_wordbag.sh
│ │ ├── dianping_wordbagroman.sh
│ │ ├── dianping_wordbagtfidf.sh
│ │ ├── dianping_wordbagtfidfroman.sh
│ │ ├── dianping_wordgram.sh
│ │ ├── dianping_wordgramroman.sh
│ │ ├── dianping_wordgramtfidf.sh
│ │ ├── dianping_wordgramtfidfroman.sh
│ │ ├── ifeng_charbag.sh
│ │ ├── ifeng_charbagtfidf.sh
│ │ ├── ifeng_chargram.sh
│ │ ├── ifeng_chargramtfidf.sh
│ │ ├── ifeng_wordbag.sh
│ │ ├── ifeng_wordbagroman.sh
│ │ ├── ifeng_wordbagtfidf.sh
│ │ ├── ifeng_wordbagtfidfroman.sh
│ │ ├── ifeng_wordgram.sh
│ │ ├── ifeng_wordgramroman.sh
│ │ ├── ifeng_wordgramtfidf.sh
│ │ ├── ifeng_wordgramtfidfroman.sh
│ │ ├── jdbinary_charbag.sh
│ │ ├── jdbinary_charbagtfidf.sh
│ │ ├── jdbinary_chargram.sh
│ │ ├── jdbinary_chargramtfidf.sh
│ │ ├── jdbinary_wordbag.sh
│ │ ├── jdbinary_wordbagroman.sh
│ │ ├── jdbinary_wordbagtfidf.sh
│ │ ├── jdbinary_wordbagtfidfroman.sh
│ │ ├── jdbinary_wordgram.sh
│ │ ├── jdbinary_wordgramroman.sh
│ │ ├── jdbinary_wordgramtfidf.sh
│ │ ├── jdbinary_wordgramtfidfroman.sh
│ │ ├── jdfull_charbag.sh
│ │ ├── jdfull_charbagtfidf.sh
│ │ ├── jdfull_chargram.sh
│ │ ├── jdfull_chargramtfidf.sh
│ │ ├── jdfull_wordbag.sh
│ │ ├── jdfull_wordbagroman.sh
│ │ ├── jdfull_wordbagtfidf.sh
│ │ ├── jdfull_wordbagtfidfroman.sh
│ │ ├── jdfull_wordgram.sh
│ │ ├── jdfull_wordgramroman.sh
│ │ ├── jdfull_wordgramtfidf.sh
│ │ ├── jdfull_wordgramtfidfroman.sh
│ │ ├── jointbinary_charbag.sh
│ │ ├── jointbinary_charbagtfidf.sh
│ │ ├── jointbinary_chargram.sh
│ │ ├── jointbinary_chargramtfidf.sh
│ │ ├── jointbinary_wordbag.sh
│ │ ├── jointbinary_wordbagroman.sh
│ │ ├── jointbinary_wordbagtfidf.sh
│ │ ├── jointbinary_wordbagtfidfroman.sh
│ │ ├── jointbinary_wordgram.sh
│ │ ├── jointbinary_wordgramroman.sh
│ │ ├── jointbinary_wordgramtfidf.sh
│ │ ├── jointbinary_wordgramtfidfroman.sh
│ │ ├── jointfull_charbag.sh
│ │ ├── jointfull_charbagtfidf.sh
│ │ ├── jointfull_chargram.sh
│ │ ├── jointfull_chargramtfidf.sh
│ │ ├── jointfull_wordbag.sh
│ │ ├── jointfull_wordbagroman.sh
│ │ ├── jointfull_wordbagtfidf.sh
│ │ ├── jointfull_wordbagtfidfroman.sh
│ │ ├── jointfull_wordgram.sh
│ │ ├── jointfull_wordgramroman.sh
│ │ ├── jointfull_wordgramtfidf.sh
│ │ ├── jointfull_wordgramtfidfroman.sh
│ │ ├── nytimes_charbag.sh
│ │ ├── nytimes_charbagtfidf.sh
│ │ ├── nytimes_chargram.sh
│ │ ├── nytimes_chargramtfidf.sh
│ │ ├── nytimes_wordbag.sh
│ │ ├── nytimes_wordbagtfidf.sh
│ │ ├── nytimes_wordgram.sh
│ │ ├── nytimes_wordgramtfidf.sh
│ │ ├── rakutenbinary_charbag.sh
│ │ ├── rakutenbinary_charbagtfidf.sh
│ │ ├── rakutenbinary_chargram.sh
│ │ ├── rakutenbinary_chargramtfidf.sh
│ │ ├── rakutenbinary_wordbag.sh
│ │ ├── rakutenbinary_wordbagroman.sh
│ │ ├── rakutenbinary_wordbagtfidf.sh
│ │ ├── rakutenbinary_wordbagtfidfroman.sh
│ │ ├── rakutenbinary_wordgram.sh
│ │ ├── rakutenbinary_wordgramroman.sh
│ │ ├── rakutenbinary_wordgramtfidf.sh
│ │ ├── rakutenbinary_wordgramtfidfroman.sh
│ │ ├── rakutenfull_charbag.sh
│ │ ├── rakutenfull_charbagtfidf.sh
│ │ ├── rakutenfull_chargram.sh
│ │ ├── rakutenfull_chargramtfidf.sh
│ │ ├── rakutenfull_wordbag.sh
│ │ ├── rakutenfull_wordbagroman.sh
│ │ ├── rakutenfull_wordbagtfidf.sh
│ │ ├── rakutenfull_wordbagtfidfroman.sh
│ │ ├── rakutenfull_wordgram.sh
│ │ ├── rakutenfull_wordgramroman.sh
│ │ ├── rakutenfull_wordgramtfidf.sh
│ │ └── rakutenfull_wordgramtfidfroman.sh
│ ├── config.lua
│ ├── data.lua
│ ├── driver.lua
│ ├── model.lua
│ ├── queue.lua
│ ├── test.lua
│ ├── train.lua
│ └── unittest/
│ ├── data.lua
│ ├── driver.lua
│ ├── model.lua
│ ├── test.lua
│ └── train.lua
├── models/
│ ├── README.txt
│ ├── embednet/
│ │ └── README.txt
│ ├── fasttext/
│ │ └── README.txt
│ ├── glyphnet/
│ │ └── README.txt
│ ├── linearnet/
│ │ └── README.txt
│ └── onehotnet/
│ └── README.txt
├── onehotnet/
│ ├── archive/
│ │ ├── 11stbinary_onehot4temporal12length2048feature256.sh
│ │ ├── 11stbinary_onehot4temporal12length2048feature256roman.sh
│ │ ├── 11stbinary_onehot4temporal8length1944feature256.sh
│ │ ├── 11stbinary_onehot4temporal8length1944feature256roman.sh
│ │ ├── 11stfull_onehot4temporal12length2048feature256.sh
│ │ ├── 11stfull_onehot4temporal12length2048feature256roman.sh
│ │ ├── 11stfull_onehot4temporal8length1944feature256.sh
│ │ ├── 11stfull_onehot4temporal8length1944feature256roman.sh
│ │ ├── amazonbinary_onehot4temporal12length2048feature256.sh
│ │ ├── amazonbinary_onehot4temporal8length1944feature256.sh
│ │ ├── amazonfull_onehot4temporal12length2048feature256.sh
│ │ ├── amazonfull_onehot4temporal8length1944feature256.sh
│ │ ├── chinanews_onehot4temporal12length2048feature256.sh
│ │ ├── chinanews_onehot4temporal12length2048feature256roman.sh
│ │ ├── chinanews_onehot4temporal8length1944feature256.sh
│ │ ├── chinanews_onehot4temporal8length1944feature256roman.sh
│ │ ├── dianping_onehot4temporal12length2048feature256.sh
│ │ ├── dianping_onehot4temporal12length2048feature256roman.sh
│ │ ├── dianping_onehot4temporal8length1944feature256.sh
│ │ ├── dianping_onehot4temporal8length1944feature256roman.sh
│ │ ├── ifeng_onehot4temporal12length2048feature256.sh
│ │ ├── ifeng_onehot4temporal12length2048feature256roman.sh
│ │ ├── ifeng_onehot4temporal8length1944feature256.sh
│ │ ├── ifeng_onehot4temporal8length1944feature256roman.sh
│ │ ├── jdbinary_onehot4temporal12length2048feature256.sh
│ │ ├── jdbinary_onehot4temporal12length2048feature256roman.sh
│ │ ├── jdbinary_onehot4temporal8length1944feature256.sh
│ │ ├── jdbinary_onehot4temporal8length1944feature256roman.sh
│ │ ├── jdfull_onehot4temporal12length2048feature256.sh
│ │ ├── jdfull_onehot4temporal12length2048feature256roman.sh
│ │ ├── jdfull_onehot4temporal8length1944feature256.sh
│ │ ├── jdfull_onehot4temporal8length1944feature256roman.sh
│ │ ├── jointbinary_onehot4temporal12length2048feature256.sh
│ │ ├── jointbinary_onehot4temporal12length2048feature256roman.sh
│ │ ├── jointbinary_onehot4temporal8length1944feature256.sh
│ │ ├── jointbinary_onehot4temporal8length1944feature256roman.sh
│ │ ├── jointfull_onehot4temporal12length2048feature256.sh
│ │ ├── jointfull_onehot4temporal12length2048feature256roman.sh
│ │ ├── jointfull_onehot4temporal8length1944feature256.sh
│ │ ├── jointfull_onehot4temporal8length1944feature256roman.sh
│ │ ├── nytimes_onehot4temporal12length2048feature256.sh
│ │ ├── nytimes_onehot4temporal8length1944feature256.sh
│ │ ├── rakutenbinary_onehot4temporal12length2048feature256.sh
│ │ ├── rakutenbinary_onehot4temporal12length2048feature256roman.sh
│ │ ├── rakutenbinary_onehot4temporal8length1944feature256.sh
│ │ ├── rakutenbinary_onehot4temporal8length1944feature256roman.sh
│ │ ├── rakutenfull_onehot4temporal12length2048feature256.sh
│ │ ├── rakutenfull_onehot4temporal12length2048feature256roman.sh
│ │ ├── rakutenfull_onehot4temporal8length1944feature256.sh
│ │ └── rakutenfull_onehot4temporal8length1944feature256roman.sh
│ ├── config.lua
│ ├── data.lua
│ ├── driver.lua
│ ├── model.lua
│ └── unittest/
│ ├── data.lua
│ ├── driver.lua
│ ├── model.lua
│ ├── model_cuda.lua
│ ├── model_cudnn.lua
│ ├── test.lua
│ ├── test_cuda.lua
│ ├── train.lua
│ └── train_cuda.lua
└── unifont/
├── createunifont.lua
├── unifont/
│ └── README.txt
└── visualize.lua
================================================
FILE CONTENTS
================================================
================================================
FILE: LICENSE
================================================
BSD 3-Clause License
Copyright (c) 2017, Xiang Zhang
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: README.md
================================================
# Glyph
This repository is used to publish all the code used for the following article:
[Xiang Zhang, Yann LeCun, Which Encoding is the Best for Text Classification in Chinese, English, Japanese and Korean?, arXiv 1708.02657](https://arxiv.org/abs/1708.02657)
The code and datasets are completely released as of January 2018, including all the code for crawling, preprocessing and training on the datasets. However, the documentation may not be complete yet. That said, readers could refer to the `doc` directory for an example in reproducing all the results for the Dianping dataset, and extend that to other datasets in similar ways.
## Reproducibility Manifesto
If anyone sees a number in our paper, there is a script one can execute to reproduce it. No responsibility should be imposed on the user to figure out any experimental parameter buried in the paper's content.
## Datasets
The `data` directory contains the preprocessing scripts for all the datasets used in the paper. These datasets are released separately of their processing source code. See below for details.
### Summary
The following table is a summary of the datasets. Most of them have millions of samples for training.
| Dataset | Language | Classes | Train | Test |
|----------------|--------------|---------|------------|-----------|
| Dianping | Chinese | 2 | 2,000,000 | 500,000 |
| JD full | Chinese | 5 | 3,000,000 | 250,000 |
| JD binary | Chinese | 2 | 4,000,000 | 360,000 |
| Rakuten full | Japanese | 5 | 4,000,000 | 500,000 |
| Rakuten binary | Japanese | 2 | 3,400,000 | 400,000 |
| 11st full | Korean | 5 | 750,000 | 100,000 |
| 11st binary | Korean | 2 | 4,000,000 | 400,000 |
| Amazon full | English | 5 | 3,000,000 | 650,000 |
| Amazon binary | English | 2 | 3,600,000 | 400,000 |
| Ifeng | Chinese | 5 | 800,000 | 50,000 |
| Chinanews | Chinese | 7 | 1,400,000 | 112,000 |
| NYTimes | English | 7 | 1,400,000 | 105,000 |
| Joint full | Multilingual | 5 | 10,750,000 | 1,500,000 |
| Joint binary | Multilingual | 2 | 15,000,000 | 1,560,000 |
### Download
Datasets are released separately from the source code via links from Google Drive. *These datasets should only be used for the purpose of research*.
| Dataset | Train | Test |
|----------------|--------------------------------|-------------------------------|
| Dianping | [Link](https://goo.gl/uKPxyo) | [Link](https://goo.gl/2QZpLx) |
| JD full | [Link](https://goo.gl/u3vsak) | [Link](https://goo.gl/hLZRky) |
| JD binary | [Link](https://goo.gl/ZPj1ip) | [Link](https://goo.gl/bqiEfP) |
| Rakuten full | [Link](https://goo.gl/A7y14i) | [Link](https://goo.gl/ve4mup) |
| Rakuten binary | [Link](https://goo.gl/3kYQ2f) | [Link](https://goo.gl/m8FpeH) |
| 11st full | [Link](https://goo.gl/F1oPBX) | [Link](https://goo.gl/ZpTLND) |
| 11st binary | [Link](https://goo.gl/8Qi7ao) | [Link](https://goo.gl/nbBhFq) |
| Amazon full | [Link](https://goo.gl/UzQWaj) | [Link](https://goo.gl/EXkzWs) |
| Amazon binary | [Link](https://goo.gl/u7AxWS) | [Link](https://goo.gl/2fft8x) |
| Ifeng | [Link](https://goo.gl/AtKsq4) | [Link](https://goo.gl/tLWojy) |
| Chinanews | [Link](https://goo.gl/1p4kdx) | [Link](https://goo.gl/rxvhCJ) |
| NYTimes | [Link](https://goo.gl/2hZeqd) | [Link](https://goo.gl/66EDa5) |
| Joint full | [Link](https://goo.gl/AJfzLC) | [Link](https://goo.gl/mibMsV) |
| Joint binary | [Link](https://goo.gl/YLMqNe) | [Link](https://goo.gl/WRXQuJ) |
## GNU Unifont
The `glyphnet` scripts require the GNU Unifont character images to run. The file `unifont-8.0.01.t7b.xz` can be downloaded via [this link](https://goo.gl/aFxYHq).
================================================
FILE: data/11st/construct_rr.py
================================================
#!/usr/bin/python3
'''
Convert Korean datasets to Revised Romanization of Korean (RR, MC2000)
Copyright 2016 Xiang Zhang
Usage: python3 construct_rr.py -i [input] -o [output]
'''
# Input file
INPUT = '../data/11st/sentiment/full_train.csv'
# Output file
OUTPUT = '../data/11st/sentiment/full_train_rr.csv'
import argparse
import csv
import hanja
import unidecode
# Hangul romanization libraries
from hangul_romanize import Transliter
from hangul_romanize.rule import academic
# Main program
def main():
    '''Parse command-line options into the module globals, then romanize
    the input csv into the output csv.'''
    global INPUT, OUTPUT
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', default=INPUT, help='Input file')
    parser.add_argument('-o', '--output', default=OUTPUT, help='Output file')
    opts = parser.parse_args()
    INPUT, OUTPUT = opts.input, opts.output
    # Academic rule set implements Revised Romanization (RR)
    convertRoman(Transliter(academic))
def romanizeText(transliter, text):
    '''Romanize a single text field.

    Strips surrounding whitespace; a field that is empty after stripping is
    returned unchanged. Otherwise hanja (Chinese characters) are first
    substituted with hangul, then the whole text is transliterated.'''
    stripped = text.strip()
    if stripped == '':
        return stripped
    return transliter.translit(hanja.translate(stripped, 'substitution'))
# Romanize the Korean text fields of the input csv (Revised Romanization)
def convertRoman(transliter):
    '''Read the csv at INPUT, romanize every field except the leading class
    label, and write the result to OUTPUT.

    Romanized fields are ASCII-folded with unidecode and newlines are
    escaped as the two characters "\\n".'''
    # Open the files; "with" guarantees both handles are closed (the
    # original leaked them, risking truncated output on interpreter exit).
    with open(INPUT, encoding = 'utf-8', newline = '') as ifd, \
            open(OUTPUT, 'w', encoding = 'utf-8', newline = '') as ofd:
        reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
        writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
        # Loop over the csv rows
        n = 0
        for row in reader:
            new_row = [row[0]]
            for i in range(1, len(row)):
                new_row.append(unidecode.unidecode(romanizeText(
                    transliter, row[i])).strip().replace('\n', '\\n'))
            writer.writerow(new_row)
            n = n + 1
            if n % 1000 == 0:
                print('\rProcessing line: {}'.format(n), end = '')
        print('\rProcessed lines: {}'.format(n))
if __name__ == '__main__':
main()
================================================
FILE: data/11st/create_post.py
================================================
#!/usr/bin/python3
'''
Create data from list of LZMA compressed archives of reviews
Copyright 2016 Xiang Zhang
Usage: python3 create_post.py -i [input file pattern] -o [output file]
'''
import argparse
import csv
import glob
import json
import lzma
INPUT = '../data/11st/post/*.json.xz'
OUTPUT = '../data/11st/sentiment/post.csv'
def main():
    '''Parse command-line options into the module globals, then extract
    the reviews.'''
    global INPUT, OUTPUT
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', default=INPUT,
                        help='Input file pattern')
    parser.add_argument('-o', '--output', default=OUTPUT, help='Output file')
    opts = parser.parse_args()
    INPUT, OUTPUT = opts.input, opts.output
    createData()
def createData():
    '''Read every LZMA-compressed JSON-lines archive matching INPUT and
    write [star, title, content] rows to OUTPUT as a quoted csv.

    Items without a star rating are skipped; newlines in text fields are
    escaped as "\\n". Failures in individual archives are reported and
    ignored so one corrupt file does not abort the run.'''
    # Open the output file
    ofd = open(OUTPUT, 'w', newline = '', encoding = 'utf-8')
    writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
    # Grab the files
    files = glob.glob(INPUT)
    n = 0
    filecount = 0
    for filename in files:
        filecount = filecount + 1
        print('Processing file {}/{}: {}. Processed items {}.'.format(
            filecount, len(files), filename, n))
        try:
            # "with" closes the archive even if a line fails to parse;
            # the original leaked the handle on exceptions.
            with lzma.open(filename, 'rt', encoding = 'utf-8') as ifd:
                for line in ifd:
                    review = json.loads(line)
                    star = review.get('star', '')
                    title = review.get('title', '')
                    content = review.get('content', '')
                    if star != '':
                        n = n + 1
                        writer.writerow([star, title.replace('\n', '\\n'),
                                         content.replace('\n', '\\n')])
        except Exception as e:
            # Deliberate best-effort: report and continue with other files
            print('Exception (ignored): {}'.format(e))
    ofd.close()
if __name__ == '__main__':
main()
================================================
FILE: data/11st/create_review.py
================================================
#!/usr/bin/python3
'''
Create data from list of LZMA compressed archives of reviews
Copyright 2016 Xiang Zhang
Usage: python3 create_review.py -i [input file pattern] -o [output file]
'''
import argparse
import csv
import glob
import json
import lzma
INPUT = '../data/11st/review/*.json.xz'
OUTPUT = '../data/11st/sentiment/review.csv'
def main():
    '''Parse command-line options into the module globals, then extract
    the reviews.'''
    global INPUT, OUTPUT
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', default=INPUT,
                        help='Input file pattern')
    parser.add_argument('-o', '--output', default=OUTPUT, help='Output file')
    opts = parser.parse_args()
    INPUT, OUTPUT = opts.input, opts.output
    createData()
def createData():
    '''Read every LZMA-compressed JSON-lines archive matching INPUT and
    write [star, title, content] rows to OUTPUT as a quoted csv.

    Items without a star rating are skipped; newlines in text fields are
    escaped as "\\n". Failures in individual archives are reported and
    ignored so one corrupt file does not abort the run.'''
    # Open the output file
    ofd = open(OUTPUT, 'w', newline = '', encoding = 'utf-8')
    writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
    # Grab the files
    files = glob.glob(INPUT)
    n = 0
    filecount = 0
    for filename in files:
        filecount = filecount + 1
        print('Processing file {}/{}: {}. Processed items {}.'.format(
            filecount, len(files), filename, n))
        try:
            # "with" closes the archive even if a line fails to parse;
            # the original leaked the handle on exceptions.
            with lzma.open(filename, 'rt', encoding = 'utf-8') as ifd:
                for line in ifd:
                    review = json.loads(line)
                    star = review.get('star', '')
                    title = review.get('title', '')
                    content = review.get('content', '')
                    if star != '':
                        n = n + 1
                        writer.writerow([star, title.replace('\n', '\\n'),
                                         content.replace('\n', '\\n')])
        except Exception as e:
            # Deliberate best-effort: report and continue with other files
            print('Exception (ignored): {}'.format(e))
    ofd.close()
if __name__ == '__main__':
main()
================================================
FILE: data/11st/segment_rr_word.lua
================================================
--[[
Create romananized word data from romanized data in csv for Korean
Copyright 2016 Xiang Zhang
Usage: th segment_rr_word.lua [input] [output] [list] [read]
--]]
local ffi = require('ffi')
local io = require('io')
local math = require('math')
local tds = require('tds')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: read or build the romanized word list, then rewrite the
-- input csv with word indices.
function joe.main()
   local input = arg[1] or '../data/11st/sentiment/full_train_rr.csv'
   local output = arg[2] or '../data/11st/sentiment/full_train_rr_word.csv'
   local list = arg[3] or '../data/11st/sentiment/full_train_rr_word_list.csv'
   -- Only the literal string 'true' enables reuse of an existing list
   local read = (arg[4] == 'true')
   local word_index, word_total
   if read then
      print('Reading word index')
      word_index, word_total = joe.readWords(list)
   else
      print('Counting words')
      local word_count, word_freq = joe.splitWords(input)
      print('Sorting words by count')
      word_index, word_total = joe.sortWords(list, word_count, word_freq)
   end
   print('Constructing word index output')
   joe.constructWords(input, output, word_index, word_total)
end
-- Load a previously generated word list csv. Returns a hash mapping each
-- word to its 1-based line number (rank) and the total number of words.
-- The "\n" escape stored in the list is reversed before indexing.
function joe.readWords(list)
   local index = tds.Hash()
   local fd = io.open(list)
   local count = 0
   for line in fd:lines() do
      count = count + 1
      if math.fmod(count, 10000) == 0 then
         io.write('\rProcessing line: '..count)
         io.flush()
      end
      local fields = joe.parseCSVLine(line)
      local word = fields[1]:gsub('\\n', '\n')
      index[word] = count
   end
   print('\rProcessed lines: '..count)
   fd:close()
   return index, count
end
-- Tokenize every text field of the input csv and count words. Returns
-- word -> total occurrence count and word -> document frequency (the
-- fraction of lines containing the word). Punctuation except hyphen is
-- split into separate tokens.
function joe.splitWords(input)
   local word_count, word_freq = tds.Hash(), tds.Hash()
   local fd = io.open(input)
   local n = 0
   for line in fd:lines() do
      n = n + 1
      if math.fmod(n, 10000) == 0 then
         io.write('\rProcessing line: ', n)
         io.flush()
      end
      local content = joe.parseCSVLine(line)
      -- Bug fix: field_set was accidentally a global, leaking state out of
      -- this function; it must be a fresh local per sample.
      local field_set = {}
      for i = 2, #content do
         content[i] = content[i]:gsub('\\n', '\n'):gsub("^%s*(.-)%s*$", "%1")
         -- All punctuation characters except for hyphen "-"
         content[i] = content[i]:gsub(
            '([!"#$%%&\'()*+,./:;<=>?@%[\\%]^_`{|}~])', ' %1 ')
         for word in content[i]:gmatch('[%S]+') do
            word_count[word] = (word_count[word] or 0) + 1
            -- Count each word at most once per sample for the frequency
            if not field_set[word] then
               field_set[word] = true
               word_freq[word] = (word_freq[word] or 0) + 1
            end
         end
      end
   end
   print('\rProcessed lines: '..n)
   fd:close()
   -- Normalizing word frequencies
   for key, value in pairs(word_freq) do
      word_freq[key] = value / n
   end
   return word_count, word_freq
end
-- Sort words by descending count, write "word","count","frequency" rows
-- to the list file, and return the word -> rank hash plus the vocabulary
-- size.
function joe.sortWords(list, word_count, word_freq)
   -- Sort the list of words
   -- Bug fix: word_list, word_index and fd were accidentally global.
   local word_list = tds.Vec()
   for word, _ in pairs(word_count) do
      word_list[#word_list + 1] = word
   end
   word_list:sort(function (w, v) return word_count[w] > word_count[v] end)
   -- Create the word index
   local word_index = tds.Hash()
   for index, word in ipairs(word_list) do
      word_index[word] = index
   end
   -- Write it to file; newlines and quotes are csv-escaped
   local fd = io.open(list, 'w')
   for index, word in ipairs(word_list) do
      fd:write('"', word:gsub("\n", "\\n"):gsub("\"", "\"\""), '","',
               word_count[word], '","', word_freq[word], '"\n')
   end
   -- Bug fix: the handle was never closed, so the tail of the list could
   -- stay unflushed until garbage collection.
   fd:close()
   return word_index, #word_list
end
-- Rewrite the input csv to the output csv, replacing every text field by
-- the space-separated ranks of its words. Out-of-vocabulary words map to
-- the extra index word_total + 1. The class label (first field) is kept.
function joe.constructWords(input, output, word_index, word_total)
   local ifd = io.open(input)
   local ofd = io.open(output, 'w')
   local n = 0
   for line in ifd:lines() do
      n = n + 1
      if math.fmod(n, 10000) == 0 then
         io.write('\rProcessing line: ', n)
         io.flush()
      end
      local content = joe.parseCSVLine(line)
      ofd:write('"', content[1], '"')
      for i = 2, #content do
         -- Unescape newlines and trim surrounding whitespace
         content[i] = content[i]:gsub('\\n', '\n'):gsub("^%s*(.-)%s*$", "%1")
         -- All punctuation characters except for hyphen "-"
         content[i] = content[i]:gsub(
            '([!"#$%%&\'()*+,./:;<=>?@%[\\%]^_`{|}~])', ' %1 ')
         -- first_write suppresses the separating space before index 1
         local first_write = true
         ofd:write(',"')
         for word in content[i]:gmatch('[%S]+') do
            local index = word_index[word] or word_total + 1
            if first_write then
               first_write = false
               ofd:write(index)
            else
               ofd:write(' ', index)
            end
         end
         ofd:write('"')
      end
      ofd:write('\n')
   end
   print('\rProcessed lines: '..n)
   ifd:close()
   ofd:close()
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one csv line into a table of field strings. Quoted fields may
-- contain the separator; an embedded quote is escaped by doubling ("").
-- sep defaults to ','.
function joe.parseCSVLine(line,sep)
   local res = {}
   local pos = 1
   sep = sep or ','
   while true do
      local c = string.sub(line,pos,pos)
      if (c == "") then break end
      if (c == '"') then
         -- quoted value (ignore separator within)
         local txt = ""
         repeat
            -- %b"" matches a balanced pair of quotes from pos
            local startp,endp = string.find(line,'^%b""',pos)
            txt = txt..string.sub(line,startp+1,endp-1)
            pos = endp + 1
            c = string.sub(line,pos,pos)
            if (c == '"') then txt = txt..'"' end
            -- check first char AFTER quoted string, if it is another
            -- quoted string without separator, then append it
            -- this is the way to "escape" the quote char in a quote.
         until (c ~= '"')
         table.insert(res,txt)
         assert(c == sep or c == "")
         pos = pos + 1
      else
         -- no quotes used, just look for the first separator
         local startp,endp = string.find(line,sep,pos)
         if (startp) then
            table.insert(res,string.sub(line,pos,startp-1))
            pos = endp + 1
         else
            -- no separator found -> use rest of string and terminate
            table.insert(res,string.sub(line,pos))
            break
         end
      end
   end
   return res
end
joe.main()
return joe
================================================
FILE: data/11st/segment_word.py
================================================
#!/usr/bin/python3
'''
Convert Korean datasets to Index of Words
Copyright 2016 Xiang Zhang
Usage: python3 segment_word.py -i [input] -l [list] -o [output] [-r]
'''
#Input file
INPUT = '../data/11st/sentiment/full_train.csv'
#Output file
OUTPUT = '../data/11st/sentiment/full_train_word.csv'
# List file
LIST = '../data/11st/sentiment/full_train_word_list.csv'
# Read already defined word list
READ = False
# Korean dictionary path for MeCab
MECAB_DICT_PATH = '/home/xiang/.usr/lib/mecab/dic/mecab-ko-dic'
import argparse
import csv
from konlpy.tag import Mecab
# Main program
def main():
    '''Parse arguments, build (or read) the word index, then convert the
    input csv into a csv of word indices.'''
    global INPUT
    global OUTPUT
    global LIST
    # Bug fix: READ was assigned below without a global declaration, so the
    # assignment created a local that shadowed the module-level flag.
    global READ
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help = 'Input file', default = INPUT)
    parser.add_argument(
        '-o', '--output', help = 'Output file', default = OUTPUT)
    parser.add_argument('-l', '--list', help = 'Word list file', default = LIST)
    parser.add_argument(
        '-r', '--read', help = 'Read from list file', action = 'store_true')
    args = parser.parse_args()
    INPUT = args.input
    OUTPUT = args.output
    LIST = args.list
    READ = args.read
    if READ:
        print('Reading word index')
        word_index = readWords()
    else:
        print('Counting words')
        word_count, word_freq = segmentWords()
        print('Sorting words by count')
        word_index = sortWords(word_count, word_freq)
    print('Constructing word index output')
    convertWords(word_index)
# Read from pre-existing word list
def readWords():
    '''Load the word list csv named by LIST and return a dict mapping each
    word to its 1-based rank. The "\\n" escape in stored words is reversed.'''
    word_index = dict()
    n = 0
    # "with" guarantees the file is closed (the original leaked the handle)
    with open(LIST, encoding = 'utf-8', newline = '') as ifd:
        reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
        # Loop over the csv rows
        for row in reader:
            word = row[0].replace('\\n', '\n')
            word_index[word] = n + 1
            n = n + 1
            if n % 1000 == 0:
                print('\rProcessing line: {}'.format(n), end = '')
    print('\rProcessed lines: {}'.format(n))
    return word_index
# Segment the Korean text with MeCab and count the words
def segmentWords():
    '''Tokenize every text field of INPUT with MeCab and return two dicts:
    total occurrence counts, and document frequencies normalized by the
    number of csv lines (each word counted at most once per line).'''
    mecab = Mecab(MECAB_DICT_PATH)
    word_count, word_freq = dict(), dict()
    n = 0
    ifd = open(INPUT, encoding = 'utf-8', newline = '')
    for row in csv.reader(ifd, quoting = csv.QUOTE_ALL):
        seen = set()
        # Skip the class label in row[0]; only text fields are tokenized
        for field in row[1:]:
            for word in mecab.morphs(field.replace('\\n', '\n')):
                word_count[word] = word_count.get(word, 0) + 1
                if word not in seen:
                    seen.add(word)
                    word_freq[word] = word_freq.get(word, 0) + 1
        n = n + 1
        if n % 1000 == 0:
            print('\rProcessing line: {}'.format(n), end = '')
    print('\rProcessed lines: {}'.format(n))
    ifd.close()
    # Turn per-document counts into frequencies
    for word in word_freq:
        word_freq[word] = float(word_freq[word]) / float(n)
    return word_count, word_freq
# Sort words by descending count and write the list file
def sortWords(word_count, word_freq):
    '''Write words (most frequent first) with their counts and frequencies
    to LIST as a quoted csv, and return a {word: 1-based rank} dict.'''
    # sorted() is stable, so ties keep dict iteration order
    ranked = sorted(word_count, key = word_count.get, reverse = True)
    ofd = open(LIST, 'w', encoding = 'utf-8', newline = '')
    writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
    word_index = dict()
    n = 0
    for rank, word in enumerate(ranked, start = 1):
        writer.writerow([word.replace('\n', '\\n'), str(word_count[word]),
                         str(word_freq[word])])
        word_index[word] = rank
        n = rank
        if n % 1000 == 0:
            print('\rProcessing word: {}'.format(n), end = '')
    print('\rProcessed words: {}'.format(n))
    ofd.close()
    return word_index
# Convert the Korean text fields to lists of word indices
def convertWords(word_index):
    '''Rewrite INPUT into OUTPUT, replacing every text field by the
    space-joined ranks of its MeCab tokens. Words not in word_index map to
    len(word_index) + 1; the class label in column 0 is preserved.'''
    mecab = Mecab(MECAB_DICT_PATH)
    # Open the files
    ifd = open(INPUT, encoding = 'utf-8', newline = '')
    ofd = open(OUTPUT, 'w', encoding = 'utf-8', newline = '')
    reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
    writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
    # The index is fixed here, so the OOV value can be hoisted
    unknown = len(word_index) + 1
    n = 0
    for row in reader:
        converted = [row[0]]
        for field in row[1:]:
            tokens = mecab.morphs(field.replace('\\n', '\n'))
            converted.append(' '.join(
                str(word_index.get(word, unknown)) for word in tokens))
        writer.writerow(converted)
        n = n + 1
        if n % 1000 == 0:
            print('\rProcessing line: {}'.format(n), end = '')
    print('\rProcessed lines: {}'.format(n))
    ifd.close()
    ofd.close()
if __name__ == '__main__':
main()
================================================
FILE: data/README.md
================================================
# Datasets
This directory contains the preprocessing scripts for all the datasets used in the paper. These datasets are released separately of their processing source code.
================================================
FILE: data/chinanews/construct_topic.py
================================================
#!/usr/bin/python3
'''
Create data from list of LZMA compressed archives of news articles
Copyright 2016 Xiang Zhang
Usage: python3 construct_topic.py -i [input directory] -o [output file]
'''
import argparse
import csv
import glob
import json
import lzma
INPUT = '../data/chinanews/article'
OUTPUT = '../data/chinanews/topic/news.csv'
CATEGORY_FILE = '../data/chinanews/category/category.json'
def main():
    '''Parse command-line options into the module globals, then build the
    topic-classification csv.'''
    global INPUT, OUTPUT, CATEGORY_FILE
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', default=INPUT,
                        help='Input file directory')
    parser.add_argument('-o', '--output', default=OUTPUT, help='Output file')
    parser.add_argument('-c', '--category', default=CATEGORY_FILE,
                        help='Category file')
    opts = parser.parse_args()
    INPUT, OUTPUT, CATEGORY_FILE = opts.input, opts.output, opts.category
    createData()
def createData():
    '''Build a topic csv from LZMA-compressed JSON-lines news archives.

    CATEGORY_FILE maps each category code (a filename prefix) to a 1-based
    class index; every archive named <code>_*.json.xz under INPUT
    contributes [class, title, abstract] rows to OUTPUT, where the abstract
    is the first content paragraph. Failures in individual archives are
    reported and ignored.'''
    # Open the category file; "with" closes it (the original leaked it)
    classes = dict()
    with open(CATEGORY_FILE, encoding = 'utf-8') as cfd:
        i = 1
        for line in cfd:
            category = json.loads(line)
            classes[category['code']] = i
            i = i + 1
    # Open the output file
    ofd = open(OUTPUT, 'w', newline = '', encoding = 'utf-8')
    writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
    # Grab the files
    for prefix in classes:
        files = glob.glob(INPUT + '/' + prefix + '_*.json.xz')
        index = classes[prefix]
        n = 0
        filecount = 0
        for filename in files:
            filecount = filecount + 1
            print('Processing file {}/{}: {}. Processed items {}.'.format(
                filecount, len(files), filename, n))
            try:
                # "with" closes the archive even when a line fails to parse
                with lzma.open(filename, 'rt', encoding = 'utf-8') as ifd:
                    for line in ifd:
                        news = json.loads(line)
                        title = news.get('title', '')
                        content = news.get('content', list())
                        # The first paragraph serves as the abstract
                        abstract = ''
                        if len(content) > 0:
                            abstract = content[0]
                        n = n + 1
                        writer.writerow([index, title.replace('\n', '\\n'),
                                         abstract.replace('\n', '\\n')])
            except Exception as e:
                # Deliberate best-effort: report and continue
                print('Exception (ignored): {}'.format(e))
    ofd.close()
if __name__ == '__main__':
main()
================================================
FILE: data/data/README.txt
================================================
This directory should contain training and testing datasets.
================================================
FILE: data/dianping/combine_gram_count.lua
================================================
--[[
Combine sorted gram counts
Copyright 2016 Xiang Zhang
Usage: th combine_gram_count.lua [input_prefix] [output] [samples] [chunks]
Comment: This program also outputs lines with counts as the first unquoted csv
value, so that one can use GNU sort easily.
--]]
local io = require('io')
local math = require('math')
local string = require('string')
-- A Logic Named Joe
local joe = {}
-- Entry point: parse command-line arguments (with Dianping defaults) and
-- merge the sorted per-chunk gram counts into a single file.
function joe.main()
   local prefix = arg[1] or '../data/dianping/train_chargram_count_sort/'
   local combined = arg[2] or
      '../data/dianping/train_chargram_count_combine.csv'
   local samples = arg[3] and tonumber(arg[3]) or 2000000
   local chunks = arg[4] and tonumber(arg[4]) or 100
   print('Combine chunks')
   joe.combineChunks(prefix, combined, samples, chunks)
end
-- Merge "chunks" sorted count files named <input_prefix><i>.csv. Runs of
-- rows sharing the same first field are merged by summing field 3 (count)
-- and field 4 (document count); field 4 is normalized by "samples". Each
-- merged row is written with the count duplicated as an unquoted leading
-- value so the result can be re-sorted with GNU sort.
function joe.combineChunks(input_prefix, output, samples, chunks)
   local n = 0
   local ofd = io.open(output, 'w')
   local current = {}
   -- Write the currently accumulated gram to the output file
   local function flush()
      ofd:write(current[3], ',"', current[1], '","',
                current[2]:gsub('"', '""'), '","',
                current[4] / samples, '","', current[3], '"\n')
   end
   for i = 1, chunks do
      local ifd = io.open(input_prefix..i..'.csv')
      for line in ifd:lines() do
         n = n + 1
         if math.fmod(n, 100000) == 0 then
            io.write('\rProcessing line ', n)
            io.flush()
         end
         local content = joe.parseCSVLine(line)
         if current[1] ~= content[1] then
            if current[1] ~= nil then
               flush()
            end
            current = content
         else
            -- Lua coerces the numeric csv strings in arithmetic
            current[3] = current[3] + content[3]
            current[4] = current[4] + content[4]
         end
      end
      ifd:close()
   end
   -- Bug fix: only flush the trailing gram when at least one input line
   -- was seen; the original crashed on entirely empty inputs because
   -- current[2]:gsub was called on nil.
   if current[1] ~= nil then
      flush()
   end
   ofd:close()
   print('\rProcessed lines: '..n)
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one csv line into a table of field strings. Quoted fields may
-- contain the separator; an embedded quote is escaped by doubling ("").
-- sep defaults to ','.
function joe.parseCSVLine(line,sep)
   local res = {}
   local pos = 1
   sep = sep or ','
   while true do
      local c = string.sub(line,pos,pos)
      if (c == "") then break end
      if (c == '"') then
         -- quoted value (ignore separator within)
         local txt = ""
         repeat
            -- %b"" matches a balanced pair of quotes from pos
            local startp,endp = string.find(line,'^%b""',pos)
            txt = txt..string.sub(line,startp+1,endp-1)
            pos = endp + 1
            c = string.sub(line,pos,pos)
            if (c == '"') then txt = txt..'"' end
            -- check first char AFTER quoted string, if it is another
            -- quoted string without separator, then append it
            -- this is the way to "escape" the quote char in a quote.
         until (c ~= '"')
         table.insert(res,txt)
         assert(c == sep or c == "")
         pos = pos + 1
      else
         -- no quotes used, just look for the first separator
         local startp,endp = string.find(line,sep,pos)
         if (startp) then
            table.insert(res,string.sub(line,pos,startp-1))
            pos = endp + 1
         else
            -- no separator found -> use rest of string and terminate
            table.insert(res,string.sub(line,pos))
            break
         end
      end
   end
   return res
end
joe.main()
return joe
================================================
FILE: data/dianping/construct_charbag.lua
================================================
--[[
Construct unicode character bag-of-element format from unicode serialization
Copyright 2016 Xiang Zhang
Usage: th construct_charbag.lua [input] [output] [list] [read] [limit] [replace]
--]]
local io = require('io')
local math = require('math')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: load the unicode code serialization, count character
-- occurrences, read or write the frequency list, then build and save the
-- bag-of-characters representation.
function joe.main()
   local input = arg[1] or '../data/dianping/train_code.t7b'
   local output = arg[2] or '../data/dianping/train_charbag.t7b'
   local list = arg[3] or '../data/dianping/train_charbag_list.csv'
   local read = (arg[4] == 'true')
   -- Character codes above "limit" are collapsed to the index "replace"
   local limit = arg[5] and tonumber(arg[5]) or 200000
   local replace = arg[6] and tonumber(arg[6]) or 200001
   print('Loading data from '..input)
   local data = torch.load(input)
   print('Counting character')
   local count, freq = joe.countBag(data, limit, replace)
   print('Total number of values: '..count)
   if read == true then
      print('Reading frequency from '..list)
      freq = joe.readList(list)
   else
      print('Outputing frequency list to '..list)
      joe.writeList(freq, list)
   end
   print('Constructing character bag data')
   local bag = joe.constructBag(data, count, limit, replace)
   print('Saving to '..output)
   torch.save(output, bag)
end
-- Write the frequency tensor as csv rows of "index","character","freq".
-- Indices at or below 65536 are rendered as the UTF-8 character for code
-- point index - 1; control characters and higher indices print as empty.
function joe.writeList(freq, list)
   local fd = io.open(list, 'w')
   for i = 1, freq:size(1) do
      local char = (i <= 65536) and joe.utf8str(i - 1) or ''
      -- Do not print control characters
      if i < 11 or (i > 11 and i < 33) then
         char = ''
      end
      fd:write('"', i, '","', char:gsub('\n', '\\n'):gsub('"', '""'), '","',
               freq[i], '"\n')
   end
   -- Bug fix: close the handle so buffered rows are flushed to disk; the
   -- original never closed it.
   fd:close()
end
-- Read a frequency list csv (as written by joe.writeList) and return the
-- frequencies (third column) as a torch.Tensor.
function joe.readList(list)
   local freq = {}
   local fd = io.open(list)
   for line in fd:lines() do
      local content = joe.parseCSVLine(line)
      content[2] = content[2]:gsub('\\n', '\n')
      freq[#freq + 1] = tonumber(content[3])
   end
   -- Bug fix: the file handle was never closed.
   fd:close()
   return torch.Tensor(freq)
end
-- Count, over all samples, the number of distinct (sample, character)
-- pairs ("count", the total size the sparse bag will need) and how many
-- samples contain each character ("freq", normalized to a per-sample
-- frequency at the end). Character codes above "limit" collapse into the
-- single index "replace".
function joe.countBag(data, limit, replace)
   local code, code_value = data.code, data.code_value
   local count = 0
   local freq = torch.zeros(math.max(limit, replace))
   -- Iterate through the classes
   for i = 1, #code do
      print('Processing for class '..i)
      -- Iterate through the samples
      for j = 1, code[i]:size(1) do
         if math.fmod(j, 1000) == 0 then
            io.write('\rProcessing text: ', j, '/', code[i]:size(1))
            io.flush()
         end
         -- Per-sample character occurrence table
         local index = {}
         -- Iterate through the fields; code[i][j][k] is {offset, length}
         -- into code_value
         for k = 1, code[i][j]:size(1) do
            for l = 1, code[i][j][k][2] do
               local char = code_value[code[i][j][k][1] + l - 1]
               if char > limit then
                  char = replace
               end
               if not index[char] then
                  -- First occurrence of this character in this sample
                  count = count + 1
                  index[char] = 1
                  freq[char] = freq[char] + 1
               else
                  index[char] = index[char] + 1
               end
            end
         end
      end
      print('\rProcessed texts: '..code[i]:size(1)..'/'..code[i]:size(1))
   end
   -- Normalizing the frequency by the total number of samples
   local sum = 0
   for i = 1, #code do
      sum = sum + code[i]:size(1)
   end
   freq:div(sum)
   return count, freq
end
-- Build the sparse bag representation. For each sample, bag[i][j] holds
-- {offset, length} into the parallel arrays bag_index (ascending
-- character codes) and bag_value (per-sample counts normalized to sum to
-- 1; zero for characters above "limit"). The "count" argument must be the
-- total returned by joe.countBag.
function joe.constructBag(data, count, limit, replace)
   local code, code_value = data.code, data.code_value
   local bag = {}
   local bag_index = torch.LongTensor(count)
   local bag_value = torch.DoubleTensor(count)
   -- Running write position; deliberately shadows the "count" argument
   local count = 0
   -- Iterate through the classes
   for i = 1, #code do
      print('Processing for class '..i)
      bag[i] = torch.LongTensor(code[i]:size(1), 2)
      -- Iterate through the samples
      for j = 1, code[i]:size(1) do
         if math.fmod(j, 1000) == 0 then
            io.write('\rProcessing text: ', j, '/', code[i]:size(1))
            io.flush()
         end
         -- index: per-sample occurrence counts; pointer: insertion order
         local index = {}
         local pointer = {}
         bag[i][j][1] = count + 1
         -- Iterate through the fields
         for k = 1, code[i][j]:size(1) do
            for l = 1, code[i][j][k][2] do
               local char = code_value[code[i][j][k][1] + l - 1]
               if char > limit then
                  char = replace
               end
               if not index[char] then
                  count = count + 1
                  index[char] = 1
                  pointer[#pointer + 1] = char
               else
                  index[char] = index[char] + 1
               end
            end
         end
         -- Store characters in ascending code order
         table.sort(pointer)
         bag[i][j][2] = #pointer
         for m = 1, #pointer do
            bag_index[bag[i][j][1] + m - 1] = pointer[m]
            if pointer[m] > limit then
               -- Replaced (out-of-vocabulary) characters carry no weight
               bag_value[bag[i][j][1] + m - 1] = 0
            else
               bag_value[bag[i][j][1] + m - 1] = index[pointer[m]]
            end
         end
         -- Normalize the sample's counts to sum to 1 when possible
         if #pointer > 0 and
            bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):sum() ~= 0 then
            bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):div(
               bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):sum())
         end
      end
      print('\rProcessed texts: '..code[i]:size(1)..'/'..code[i]:size(1))
   end
   return {bag = bag, bag_index = bag_index, bag_value = bag_value}
end
-- UTF-8 lead-byte table: {maximum code point, lead byte base} for 2-, 3-
-- and 4-byte sequences
joe.bytemarkers = {{0x7FF, 192}, {0xFFFF, 224}, {0x1FFFFF, 240}}
-- Encode a unicode code point (given as a decimal number) into its UTF-8
-- byte string.
function joe.utf8str(decimal)
   local bytemarkers = joe.bytemarkers
   -- ASCII fast path: single byte
   if decimal < 128 then return string.char(decimal) end
   local charbytes = {}
   for bytes,vals in ipairs(bytemarkers) do
      if decimal <= vals[1] then
         -- Emit continuation bytes from last position toward the front,
         -- consuming 6 bits of the code point each time
         for b = bytes + 1, 2, -1 do
            local mod = decimal % 64
            decimal = (decimal - mod) / 64
            charbytes[b] = string.char(128+mod)
         end
         charbytes[1] = string.char(vals[2] + decimal)
         break
      end
   end
   return table.concat(charbytes)
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one csv line into a table of field strings. Quoted fields may
-- contain the separator; an embedded quote is escaped by doubling ("").
-- sep defaults to ','.
function joe.parseCSVLine(line,sep)
   local res = {}
   local pos = 1
   sep = sep or ','
   while true do
      local c = string.sub(line,pos,pos)
      if (c == "") then break end
      if (c == '"') then
         -- quoted value (ignore separator within)
         local txt = ""
         repeat
            -- %b"" matches a balanced pair of quotes from pos
            local startp,endp = string.find(line,'^%b""',pos)
            txt = txt..string.sub(line,startp+1,endp-1)
            pos = endp + 1
            c = string.sub(line,pos,pos)
            if (c == '"') then txt = txt..'"' end
            -- check first char AFTER quoted string, if it is another
            -- quoted string without separator, then append it
            -- this is the way to "escape" the quote char in a quote.
         until (c ~= '"')
         table.insert(res,txt)
         assert(c == sep or c == "")
         pos = pos + 1
      else
         -- no quotes used, just look for the first separator
         local startp,endp = string.find(line,sep,pos)
         if (startp) then
            table.insert(res,string.sub(line,pos,startp-1))
            pos = endp + 1
         else
            -- no separator found -> use rest of string and terminate
            table.insert(res,string.sub(line,pos))
            break
         end
      end
   end
   return res
end
joe.main()
return joe
================================================
FILE: data/dianping/construct_chargram.lua
================================================
--[[
Construct unicode character ngrams format from unicode serialization
Copyright 2016 Xiang Zhang
Usage: th construct_chargram.lua [input] [output] [list] [read] [gram] [limit]
[replace]
--]]
local io = require('io')
local math = require('math')
local tds = require('tds')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point. Optional arguments (with dianping defaults): input/output
-- t7b files, the ngram list csv, whether to read an existing list
-- (`read`, default true), the maximum gram length, the vocabulary limit,
-- and the replacement index for out-of-vocabulary ngrams.
function joe.main()
  local input = arg[1] or '../data/dianping/train_code.t7b'
  local output = arg[2] or '../data/dianping/train_chargram.t7b'
  local list = arg[3] or '../data/dianping/train_chargram_list.csv'
  -- Defaults to true; any value other than the string 'true' switches
  -- to constructing the list from scratch.
  local read = (arg[4] == nil) or(arg[4] == 'true')
  local gram = arg[5] and tonumber(arg[5]) or 5
  local limit = arg[6] and tonumber(arg[6]) or 1000000
  local replace = arg[7] and tonumber(arg[7]) or 1000001
  print('Loading data from '..input)
  local data = torch.load(input)
  local freq, dict, ngrams
  if read == true then
    print('Reading frequency from '..list)
    freq, dict = joe.readList(list)
  else
    print('Constructing dictionary and frequency list')
    freq, dict, ngrams = joe.constructList(data, gram)
    print('Outputing frequency list to '..list)
    joe.writeList(freq, ngrams, list)
  end
  print('Counting character ngrams data')
  local count = joe.countBag(data, dict, gram, limit, replace)
  print('Total number of ngrams in data is '..count)
  print('Constructing character bag data')
  local bag = joe.constructBag(data, dict, count, gram, limit, replace)
  print('Saving to '..output)
  torch.save(output, bag)
end
-- Build the ngram vocabulary from the data. For every ngram of length
-- 1..gram, `count` tallies total occurrences and `docs` tallies the
-- number of samples containing it. Ngram keys are space-joined character
-- codes, e.g. "104 105".
-- Returns: freq (document frequency per ngram, ordered by descending
-- collection frequency), dict (ngram key -> rank), ngrams (ranked keys).
function joe.constructList(data, gram)
  local count = tds.Hash()
  local docs = tds.Hash()
  local code, code_value = data.code, data.code_value
  -- Iterate through the classes
  for i = 1, #code do
    print('Processing for class '..i)
    -- Iterate through the samples
    for j = 1, code[i]:size(1) do
      if math.fmod(j, 1000) == 0 then
        io.write('\rProcessing text: ', j, '/', code[i]:size(1))
        io.flush()
        collectgarbage()
      end
      -- Which ngrams were already seen in this sample, so `docs` is
      -- incremented at most once per sample.
      local index = {}
      -- Iterate through the fields
      for k = 1, code[i][j]:size(1) do
        -- Iterate through the grams
        for n = 1, gram do
          -- Iterate through the positions
          for l = 1, code[i][j][k][2] - n + 1 do
            -- Build the space-joined key for the n codes starting at l.
            local ngram = tostring(code_value[code[i][j][k][1] + l - 1])
            for m = 2, n do
              ngram = ngram..' '..tostring(
                code_value[code[i][j][k][1] + l - 1 + m - 1])
            end
            if not index[ngram] then
              docs[ngram] = (docs[ngram] or 0) + 1
              index[ngram] = 0
            end
            index[ngram] = index[ngram] + 1
            count[ngram] = (count[ngram] or 0) + 1
          end
        end
      end
    end
    print('\rProcessed texts: '..code[i]:size(1)..'/'..code[i]:size(1))
  end
  local ngrams = tds.Vec()
  for ngram, value in pairs(count) do
    ngrams[#ngrams + 1] = ngram
  end
  -- Rank ngrams by total occurrence count, most frequent first.
  ngrams:sort(function(a, b) return count[a] > count[b] end)
  -- Total number of samples, for normalizing document frequency.
  local sum = 0
  for i = 1, #code do
    sum = sum + code[i]:size(1)
  end
  local dict = tds.Hash()
  local freq = torch.Tensor(#ngrams)
  for index, ngram in ipairs(ngrams) do
    dict[ngram] = index
    freq[index] = (docs[ngram] or 0) / sum
  end
  return freq, dict, ngrams
end
-- Write the ngram frequency list to csv. Each row holds the raw ngram
-- key (space-separated shifted unicode codes), a human-readable
-- rendering of the ngram, and its document frequency.
-- freq: 1D tensor of document frequencies, ordered like ngrams.
-- ngrams: vector of ngram key strings sorted by collection frequency.
-- list: output csv file name.
function joe.writeList(freq, ngrams, list)
  local fd = io.open(list, 'w')
  for i = 1, freq:size(1) do
    local ngram_string = ''
    for code in ngrams[i]:gmatch('[%S]+') do
      local code = tonumber(code)
      -- Codes are stored shifted by +1. Render only printable BMP
      -- characters (code 11 is newline + 1); everything else becomes a
      -- space.
      local char = (code <= 65536 and (code > 32 or code == 11)) and
        joe.utf8str(code - 1) or ' '
      ngram_string = ngram_string..char
    end
    fd:write('"', ngrams[i], '","',
             ngram_string:gsub('\n', '\\n'):gsub('"', '""'), '","',
             freq[i], '"\n')
  end
  -- Close the handle explicitly; the previous version leaked it, which
  -- can lose buffered output if the interpreter exits uncleanly.
  fd:close()
end
-- Read a previously written ngram frequency list.
-- Returns the frequency tensor and a dict mapping ngram key -> rank.
function joe.readList(list)
  local freq_table = tds.Vec()
  local dict = tds.Hash()
  local fd = io.open(list)
  for line in fd:lines() do
    local content = joe.parseCSVLine(line)
    -- Only columns 1 (ngram key) and 3 (frequency) are used; the dead
    -- unescaping of column 2 from the original was removed.
    freq_table[#freq_table + 1] = tonumber(content[3])
    dict[content[1]] = #freq_table
  end
  -- Close the handle explicitly; the previous version leaked it.
  fd:close()
  local freq = torch.Tensor(#freq_table)
  for i, v in ipairs(freq_table) do
    freq[i] = v
  end
  return freq, dict
end
-- Count the total number of (sample, distinct ngram index) pairs, to
-- size the flat bag tensors. Ngrams missing from dict or ranked above
-- limit collapse into the single replace index.
function joe.countBag(data, dict, gram, limit, replace)
  local count = 0
  local code, code_value = data.code, data.code_value
  -- Iterate through the classes
  for i = 1, #code do
    print('Processing for class '..i)
    -- Iterate through the samples
    for j = 1, code[i]:size(1) do
      if math.fmod(j, 1000) == 0 then
        io.write('\rProcessing text: ', j, '/', code[i]:size(1))
        io.flush()
        collectgarbage()
      end
      -- Distinct ngram indices seen in this sample.
      local index = {}
      -- Iterate through the fields
      for k = 1, code[i][j]:size(1) do
        -- Iterate through the grams
        for n = 1, gram do
          -- Iterate through the positions
          for l = 1, code[i][j][k][2] - n + 1 do
            -- Build the space-joined key for the n codes starting at l.
            local ngram = tostring(code_value[code[i][j][k][1] + l - 1])
            for m = 2, n do
              ngram = ngram..' '..tostring(
                code_value[code[i][j][k][1] + l - 1 + m - 1])
            end
            local ngram_index = dict[ngram]
            if ngram_index == nil or ngram_index > limit then
              ngram_index = replace
            end
            -- Only the first occurrence per sample adds to count.
            if not index[ngram_index] then
              index[ngram_index] = 0
              count = count + 1
            end
            index[ngram_index] = index[ngram_index] + 1
          end
        end
      end
    end
    print('\rProcessed texts: '..code[i]:size(1)..'/'..code[i]:size(1))
  end
  return count
end
-- Build the bag-of-ngrams representation.
-- dict maps ngram keys to vocabulary ranks; ngrams missing from dict or
-- ranked above limit map to the replace index. count is the total number
-- of (sample, distinct ngram) pairs from joe.countBag, sizing the flat
-- tensors. Returns {bag, bag_index, bag_value}: bag[i][j] = (start, n)
-- points at a sorted run of n distinct indices in bag_index, with
-- L1-normalized occurrence counts in bag_value.
function joe.constructBag(data, dict, count, gram, limit, replace)
  local code, code_value = data.code, data.code_value
  local bag = {}
  local bag_index = torch.LongTensor(count)
  local bag_value = torch.DoubleTensor(count)
  -- Shadows the `count` parameter: from here on `count` is the write
  -- position into bag_index/bag_value.
  local count = 0
  -- Iterate through the classes
  for i = 1, #code do
    print('Processing for class '..i)
    bag[i] = torch.LongTensor(code[i]:size(1), 2)
    -- Iterate through the samples
    for j = 1, code[i]:size(1) do
      if math.fmod(j, 1000) == 0 then
        io.write('\rProcessing text: ', j, '/', code[i]:size(1))
        io.flush()
        collectgarbage()
      end
      -- index: occurrence count per ngram index; pointer: the distinct
      -- indices in insertion order (sorted below).
      local index = {}
      local pointer = {}
      bag[i][j][1] = count + 1
      -- Iterate through the fields
      for k = 1, code[i][j]:size(1) do
        -- Iterate through the grams
        for n = 1, gram do
          -- Iterate through the positions
          for l = 1, code[i][j][k][2] - n + 1 do
            local ngram = tostring(code_value[code[i][j][k][1] + l - 1])
            for m = 2, n do
              ngram = ngram..' '..tostring(
                code_value[code[i][j][k][1] + l - 1 + m - 1])
            end
            local ngram_index = dict[ngram]
            if ngram_index == nil or ngram_index > limit then
              ngram_index = replace
            end
            if not index[ngram_index] then
              count = count + 1
              index[ngram_index] = 0
              pointer[#pointer + 1] = ngram_index
            end
            index[ngram_index] = index[ngram_index] + 1
          end
        end
      end
      table.sort(pointer)
      bag[i][j][2] = #pointer
      for m = 1, #pointer do
        bag_index[bag[i][j][1] + m - 1] = pointer[m]
        -- Indices above limit (the replace index when replace > limit)
        -- get zero weight.
        if pointer[m] > limit then
          bag_value[bag[i][j][1] + m - 1] = 0
        else
          bag_value[bag[i][j][1] + m - 1] = index[pointer[m]]
        end
      end
      -- L1-normalize this sample's weights when the sum is nonzero.
      if #pointer > 0 and
        bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):sum() ~= 0 then
        bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):div(
          bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):sum())
      end
    end
    print('\rProcessed texts: '..code[i]:size(1)..'/'..code[i]:size(1))
  end
  return {bag = bag, bag_index = bag_index, bag_value = bag_value}
end
-- Upper code point bound and lead-byte base for 2-, 3- and 4-byte UTF-8
-- sequences, shared by joe.utf8str.
joe.bytemarkers = {{0x7FF, 192}, {0xFFFF, 224}, {0x1FFFFF, 240}}
-- Encode a single unicode code point as a UTF-8 string. ASCII maps
-- straight to one byte; larger values are split into 6-bit continuation
-- groups. Values above 0x1FFFFF match no marker and yield ''.
function joe.utf8str(decimal)
  local bytemarkers = joe.bytemarkers
  if decimal < 128 then return string.char(decimal) end
  local out = {}
  for nbytes, marker in ipairs(bytemarkers) do
    if decimal <= marker[1] then
      -- Fill continuation bytes from the last slot backwards, six bits
      -- at a time.
      local slot = nbytes + 1
      while slot >= 2 do
        local low = decimal % 64
        decimal = (decimal - low) / 64
        out[slot] = string.char(128 + low)
        slot = slot - 1
      end
      -- Whatever remains becomes the lead byte.
      out[1] = string.char(marker[2] + decimal)
      break
    end
  end
  return table.concat(out)
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one csv line into a table of field strings. Quoted fields may
-- contain the separator, and a doubled quote ("") inside a quoted field
-- is unescaped to a single quote. `sep` defaults to ','.
function joe.parseCSVLine(line,sep)
  local res = {}
  local pos = 1
  sep = sep or ','
  while true do
    local c = string.sub(line,pos,pos)
    if (c == "") then break end
    if (c == '"') then
      -- quoted value (ignore separator within)
      local txt = ""
      repeat
        -- '%b""' matches a balanced pair of quotes starting at pos.
        local startp,endp = string.find(line,'^%b""',pos)
        txt = txt..string.sub(line,startp+1,endp-1)
        pos = endp + 1
        c = string.sub(line,pos,pos)
        if (c == '"') then txt = txt..'"' end
        -- check first char AFTER quoted string, if it is another
        -- quoted string without separator, then append it
        -- this is the way to "escape" the quote char in a quote.
      until (c ~= '"')
      table.insert(res,txt)
      -- Only a separator or end-of-line may follow a quoted field.
      assert(c == sep or c == "")
      pos = pos + 1
    else
      -- no quotes used, just look for the first separator
      local startp,endp = string.find(line,sep,pos)
      if (startp) then
        table.insert(res,string.sub(line,pos,startp-1))
        pos = endp + 1
      else
        -- no separator found -> use rest of string and terminate
        table.insert(res,string.sub(line,pos))
        break
      end
    end
  end
  return res
end
-- Run the script and expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/construct_chartoken.lua
================================================
--[[
Create chartoken format for fastText
Copyright 2017 Xiang Zhang
Usage: th construct_chartoken.lua [input] [output]
--]]
local bit32 = require('bit32')
local io = require('io')
local math = require('math')
local string = require('string')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: resolve command-line arguments (with dianping defaults)
-- and run the token construction.
function joe.main()
  local input, output =
    arg[1] or '../data/dianping/train.csv',
    arg[2] or '../data/dianping/train_chartoken.txt'
  print('Construct token')
  joe.constructToken(input, output)
end
-- Convert the csv into fastText format: each output line is
-- "__label__<class>" followed by every text character as its own
-- space-separated UTF-8 token. Characters with code <= 32 (controls,
-- spaces and the decoder's trailing 0 sentinel) are dropped.
function joe.constructToken(input, output)
  local ifd = io.open(input)
  local ofd = io.open(output, 'w')
  local n = 0
  for line in ifd:lines() do
    n = n + 1
    if math.fmod(n, 10000) == 0 then
      io.write('\rProcessing line: ', n)
      io.flush()
    end
    local content = joe.parseCSVLine(line)
    local class = tonumber(content[1])
    ofd:write('__label__', class)
    for i = 2, #content do
      -- Replace escaped newlines and control bytes with spaces, then
      -- trim surrounding whitespace.
      content[i] = content[i]:gsub('\\n', ' '):gsub(
        '[%z\001-\031\127]', ' '):gsub('^%s*(.-)%s*$', '%1')
      local sequence = joe.utf8to32(content[i])
      for j, code in ipairs(sequence) do
        if code > 32 then
          ofd:write(' ', joe.utf8str(code))
        end
      end
    end
    ofd:write('\n')
  end
  print('\rProcessed lines: '..n)
  ifd:close()
  ofd:close()
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one csv line into a table of field strings. Quoted fields may
-- contain the separator, and a doubled quote ("") inside a quoted field
-- is unescaped to a single quote. `sep` defaults to ','.
function joe.parseCSVLine(line,sep)
  local res = {}
  local pos = 1
  sep = sep or ','
  while true do
    local c = string.sub(line,pos,pos)
    if (c == "") then break end
    if (c == '"') then
      -- quoted value (ignore separator within)
      local txt = ""
      repeat
        -- '%b""' matches a balanced pair of quotes starting at pos.
        local startp,endp = string.find(line,'^%b""',pos)
        txt = txt..string.sub(line,startp+1,endp-1)
        pos = endp + 1
        c = string.sub(line,pos,pos)
        if (c == '"') then txt = txt..'"' end
        -- check first char AFTER quoted string, if it is another
        -- quoted string without separator, then append it
        -- this is the way to "escape" the quote char in a quote.
      until (c ~= '"')
      table.insert(res,txt)
      -- Only a separator or end-of-line may follow a quoted field.
      assert(c == sep or c == "")
      pos = pos + 1
    else
      -- no quotes used, just look for the first separator
      local startp,endp = string.find(line,sep,pos)
      if (startp) then
        table.insert(res,string.sub(line,pos,startp-1))
        pos = endp + 1
      else
        -- no separator found -> use rest of string and terminate
        table.insert(res,string.sub(line,pos))
        break
      end
    end
  end
  return res
end
-- UTF-8 decoding function
-- Ref: http://lua-users.org/wiki/LuaUnicode
-- Decode a UTF-8 string into a table of unicode code points, terminated
-- by a 0 sentinel. The table.insert at the top of the loop flushes the
-- previously completed character; on the first byte `val` is nil so the
-- insert is a no-op and no spurious leading entry is produced.
function joe.utf8to32(utf8str)
  assert(type(utf8str) == 'string')
  local res, seq, val = {}, 0, nil
  for i = 1, #utf8str do
    local c = string.byte(utf8str, i)
    if seq == 0 then
      -- Starting a new character: flush the previous one (if any) and
      -- derive the sequence length from the lead byte.
      table.insert(res, val)
      seq = c < 0x80 and 1 or c < 0xE0 and 2 or c < 0xF0 and 3 or
        c < 0xF8 and 4 or --c < 0xFC and 5 or c < 0xFE and 6 or
        error('Invalid UTF-8 character sequence')
      val = bit32.band(c, 2^(8-seq) - 1)
    else
      -- Continuation byte: append its low six bits.
      val = bit32.bor(bit32.lshift(val, 6), bit32.band(c, 0x3F))
    end
    seq = seq - 1
  end
  -- Flush the final character and append the 0 sentinel.
  table.insert(res, val)
  table.insert(res, 0)
  return res
end
-- UTF-8 encoding function
-- Ref: http://stackoverflow.com/questions/7983574/how-to-write-a-unicode-symbol
-- -in-lua
-- Encode a single unicode code point as a UTF-8 string. ASCII maps to
-- one byte; larger values are split into 6-bit continuation groups.
function joe.utf8str(decimal)
  -- Upper bound and lead-byte base for 2-, 3- and 4-byte sequences.
  local bytemarkers = {{0x7FF, 192}, {0xFFFF, 224}, {0x1FFFFF, 240}}
  if decimal < 128 then return string.char(decimal) end
  local pieces = {}
  for size, marker in ipairs(bytemarkers) do
    if decimal <= marker[1] then
      -- Continuation bytes are filled from the tail, six bits apiece.
      for slot = size + 1, 2, -1 do
        local rest = decimal % 64
        decimal = (decimal - rest) / 64
        pieces[slot] = string.char(128 + rest)
      end
      -- The remaining high bits join the lead-byte marker.
      pieces[1] = string.char(marker[2] + decimal)
      break
    end
  end
  return table.concat(pieces)
end
-- Run the script and expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/construct_code.lua
================================================
--[[
Construct unicode serialization format from string serialization format
Copyright 2015-2016 Xiang Zhang
Usage: th construct_code.lua [input] [output] [limit] [replace]
--]]
local bit32 = require('bit32')
local ffi = require('ffi')
local math = require('math')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point. Optional arguments: input string-serialization t7b,
-- output code-serialization t7b, the largest (shifted) code kept
-- (default 65536), and the replacement code for values above the limit
-- (default 33, i.e. '!' + 1).
function joe.main()
  local input = arg[1] or '../data/dianping/train_string.t7b'
  local output = arg[2] or '../data/dianping/train_code.t7b'
  local limit = arg[3] and tonumber(arg[3]) or 65536
  local replace = arg[4] and tonumber(arg[4]) or 33
  print('Loading data from '..input)
  local data = torch.load(input)
  print('Counting UTF-8 code')
  local count = joe.countCode(data)
  print('Total number of codes: '..count)
  print('Constructing UTF-8 code data')
  local code = joe.constructCode(data, count, limit, replace)
  print('Saving to '..output)
  torch.save(output, code)
end
-- Count the total number of code points across all text fields
-- (including the trailing 0 sentinel joe.utf8to32 appends per field), to
-- size the code_value tensor.
function joe.countCode(data)
  local index, content = data.index, data.content
  local count = 0
  -- Iterate through the classes
  for i = 1, #index do
    print('Processing for class '..i)
    -- Iterate through the samples
    for j = 1, index[i]:size(1) do
      if math.fmod(j, 10000) == 0 then
        io.write('\rProcessing text: ', j, '/', index[i]:size(1))
        io.flush()
      end
      -- Iterate through the fields
      for k = 1, index[i][j]:size(1) do
        -- Fields are NUL-terminated inside the byte buffer, so
        -- ffi.string recovers the text from the stored offset.
        local text = ffi.string(
          torch.data(content:narrow(1, index[i][j][k][1], 1)))
        local sequence = joe.utf8to32(text)
        count = count + #sequence
      end
    end
    print('\rProcessed texts: '..index[i]:size(1)..'/'..index[i]:size(1))
  end
  return count
end
-- Build the unicode serialization: code[i] mirrors index[i]'s shape with
-- (offset, length) pairs into code_value. Every code point is stored
-- shifted by +1, and shifted values above `limit` are mapped to
-- `replace`.
function joe.constructCode(data, count, limit, replace)
  local index, content = data.index, data.content
  local code = {}
  local code_value = torch.LongTensor(count)
  -- Global write position into code_value.
  local p = 1
  -- Iterate through the classes
  for i = 1, #index do
    print('Processing for class '..i)
    code[i] = index[i]:clone():zero()
    -- Iterate through the samples
    for j = 1, index[i]:size(1) do
      if math.fmod(j, 10000) == 0 then
        io.write('\rProcessing text: ', j, '/', index[i]:size(1))
        io.flush()
      end
      -- Iterate through the fields
      for k = 1, index[i][j]:size(1) do
        -- Fields are NUL-terminated in the byte buffer.
        local text = ffi.string(
          torch.data(content:narrow(1, index[i][j][k][1], 1)))
        local sequence = joe.utf8to32(text)
        code[i][j][k][1] = p
        code[i][j][k][2] = #sequence
        for l = 1, #sequence do
          code_value[p + l - 1] = sequence[l] + 1
          if limit and code_value[p + l - 1] > limit then
            code_value[p + l - 1] = replace
          end
        end
        p = p + #sequence
      end
    end
    print('\rProcessed texts: '..index[i]:size(1)..'/'..index[i]:size(1))
  end
  return {code = code, code_value = code_value}
end
-- UTF-8 decoding function
-- Ref: http://lua-users.org/wiki/LuaUnicode
-- Decode a UTF-8 string into a table of unicode code points, terminated
-- by a 0 sentinel. The table.insert at the top of the loop flushes the
-- previously completed character; on the first byte `val` is nil so the
-- insert is a no-op and no spurious leading entry is produced.
function joe.utf8to32(utf8str)
  assert(type(utf8str) == 'string')
  local res, seq, val = {}, 0, nil
  for i = 1, #utf8str do
    local c = string.byte(utf8str, i)
    if seq == 0 then
      -- Starting a new character: flush the previous one (if any) and
      -- derive the sequence length from the lead byte.
      table.insert(res, val)
      seq = c < 0x80 and 1 or c < 0xE0 and 2 or c < 0xF0 and 3 or
        c < 0xF8 and 4 or --c < 0xFC and 5 or c < 0xFE and 6 or
        error('Invalid UTF-8 character sequence')
      val = bit32.band(c, 2^(8-seq) - 1)
    else
      -- Continuation byte: append its low six bits.
      val = bit32.bor(bit32.lshift(val, 6), bit32.band(c, 0x3F))
    end
    seq = seq - 1
  end
  -- Flush the final character and append the 0 sentinel.
  table.insert(res, val)
  table.insert(res, 0)
  return res
end
-- Run the script and expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/construct_pinyin.py
================================================
#!/usr/bin/python3
'''
Convert Chinese datasets to Pinyin format
Copyright 2016 Xiang Zhang
Usage: python3 construct_pinyin.py -i [input] -o [output]
'''
# Default input file (overridden by the -i/--input command-line flag)
INPUT = '../data/dianping/train.csv'
# Default output file (overridden by the -o/--output command-line flag)
OUTPUT = '../data/dianping/train_pinyin.csv'
import argparse
import csv
import pypinyin
import unidecode
# Main program
def main():
    """Parse command-line flags and run the pinyin conversion.

    Overwrites the module-level INPUT/OUTPUT globals, which
    convertPinyin() reads.
    """
    global INPUT
    global OUTPUT
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help = 'Input file', default = INPUT)
    parser.add_argument(
        '-o', '--output', help = 'Output file', default = OUTPUT)
    args = parser.parse_args()
    INPUT = args.input
    OUTPUT = args.output
    convertPinyin()
# Convert the text in Chinese to pinyin
def convertPinyin():
    """Read the INPUT csv, convert every text field to numbered pinyin
    (TONE2 style) transliterated to ASCII, and write the result to
    OUTPUT. Column 0 (the class label) is copied unchanged.

    Reads the module-level INPUT/OUTPUT globals set by main().
    """
    # Context managers guarantee both handles are closed (and output
    # flushed) even on error; the previous version leaked them.
    with open(INPUT, encoding='utf-8', newline='') as ifd, \
            open(OUTPUT, 'w', encoding='utf-8', newline='') as ofd:
        reader = csv.reader(ifd, quoting=csv.QUOTE_ALL)
        writer = csv.writer(ofd, quoting=csv.QUOTE_ALL, lineterminator='\n')
        # Loop over the csv rows
        n = 0
        for row in reader:
            new_row = [row[0]]
            for i in range(1, len(row)):
                # Pinyin-ize each field, strip accents with unidecode,
                # escape embedded newlines, and join tokens with spaces.
                new_row.append(' '.join(map(
                    str.strip,
                    map(lambda s: s.replace('\n', '\\n'),
                        map(unidecode.unidecode,
                            pypinyin.lazy_pinyin(
                                row[i], style=pypinyin.TONE2))))))
            writer.writerow(new_row)
            n = n + 1
            if n % 1000 == 0:
                print('\rProcessing line: {}'.format(n), end='')
        print('\rProcessed lines: {}'.format(n))


if __name__ == '__main__':
    main()
================================================
FILE: data/dianping/construct_reviews.lua
================================================
--[[
Create reviews in csv format from original txt file
Copyright 2015-2016 Xiang Zhang
Usage: th construct_reviews [input] [output]
--]]
local cjson = require('cjson')
local io = require('io')
local math = require('math')
local joe = {}
-- Convert the raw crawl (one "url^ json" record per line, first line
-- skipped) into a two-column csv of (rate, content). Records with a
-- missing or negative rate, or empty content, are dropped.
function joe.main()
  local input = arg[1] or '../data/dianping/reviews.txt'
  local output = arg[2] or '../data/dianping/reviews.csv'
  local ifd = io.open(input)
  local ofd = io.open(output, "w")
  local n = 0
  local valid = 0
  for line in ifd:lines() do
    n = n + 1
    if math.fmod(n, 10000) == 0 then
      io.write('\rProcessing line: ', n, ', valid: ', valid)
      io.flush()
    end
    -- Skip the first line
    if n > 1 then
      -- Break content to url and json
      local point = line:find('%^')
      local data = line:sub(point + 2):gsub("^%s*(.-)%s*$", "%1")
      -- Parse the data
      -- NOTE(review): cjson.decode raises on malformed json and
      -- parsed.content is assumed to be a string, so one bad line
      -- aborts the whole run — confirm the crawl is clean.
      local parsed = cjson.decode(data)
      local content = parsed.content:gsub("^%s*(.-)%s*$", "%1")
      local rate = tonumber(parsed.rate)
      -- Record to csv
      if rate and rate >= 0 and #content > 0 then
        valid = valid + 1
        -- Escape newlines and double quotes for csv.
        content = content:gsub("\n", "\\n"):gsub("\"", "\"\"")
        ofd:write('"'..rate..'","'..content..'"\n')
      end
    end
  end
  ifd:close()
  ofd:close()
  print('\rProcessed lines: '..n..', valid: '..valid)
end
-- Run the script and expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/construct_string.lua
================================================
--[[
Create string serialization format from csv files
Copyright 2015-2016 Xiang Zhang
Usage: th construct_string.lua [input] [output]
--]]
local ffi = require('ffi')
local io = require('io')
local math = require('math')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point. Optional arguments: input csv and output t7b. Scans the
-- csv once for sizing, then serializes it into a flat byte buffer with
-- per-class index tensors.
function joe.main()
  local input = arg[1] or '../data/dianping/train.csv'
  local output = arg[2] or '../data/dianping/train_string.t7b'
  print('Counting samples')
  local count, bytes, fields = joe.countSamples(input)
  for i, v in ipairs(count) do
    print('Number of samples in class '..i..': '..v)
  end
  print('Total number of bytes: '..bytes)
  print('Number of text fields: '..fields)
  print('Constructing data')
  local data = joe.constructData(input, count, bytes, fields)
  print('Saving to '..output)
  torch.save(output, data)
end
-- Scan the csv once to gather sizing information for serialization.
-- Returns a table of per-class sample counts, the total number of bytes
-- needed for all text fields (plus one per field for the string
-- terminator), and the number of text fields per row. Raises an error
-- if the field count is inconsistent across rows.
function joe.countSamples(input)
  local count, bytes, fields, n = {}, 0, nil, 0
  local fd = io.open(input)
  for line in fd:lines() do
    n = n + 1
    if math.fmod(n, 10000) == 0 then
      io.write('\rProcessing line: ', n)
      io.flush()
    end
    local row = joe.parseCSVLine(line)
    local label = tonumber(row[1])
    count[label] = (count[label] or 0) + 1
    for i = 2, #row do
      -- Unescape newlines and trim before measuring, so the byte count
      -- matches what constructData will actually store.
      row[i] = row[i]:gsub('\\n', '\n'):gsub("^%s*(.-)%s*$", "%1")
      bytes = bytes + row[i]:len() + 1
    end
    fields = fields or (#row - 1)
    if fields ~= #row - 1 then
      error('Number of fields is not '..fields..' at line '..n)
    end
  end
  print('\rProcessed lines: '..n)
  fd:close()
  return count, bytes, fields
end
-- Pack all text fields into one ByteTensor. index[class][sample][field]
-- stores (offset, byte length); each field is copied together with its
-- trailing NUL so it can later be read back with ffi.string.
function joe.constructData(input, count, bytes, fields)
  local data = torch.ByteTensor(bytes)
  local index = {}
  for i, v in ipairs(count) do
    index[i] = torch.LongTensor(v, fields, 2)
  end
  -- Per-class cursor over the sample dimension.
  local progress = {}
  local n = 0
  -- Global write position into the byte buffer.
  local p = 1
  local fd = io.open(input)
  for line in fd:lines() do
    n = n + 1
    if math.fmod(n, 10000) == 0 then
      io.write('\rProcessing line: ', n)
      io.flush()
    end
    local content = joe.parseCSVLine(line)
    local class = tonumber(content[1])
    progress[class] = progress[class] and progress[class] + 1 or 1
    for i = 2, #content do
      -- Unescape newlines and trim, matching the counting pass.
      content[i] = content[i]:gsub('\\n', '\n'):gsub("^%s*(.-)%s*$", "%1")
      index[class][progress[class]][i - 1][1] = p
      index[class][progress[class]][i - 1][2] = content[i]:len()
      -- Copy length + 1 bytes to include the terminating NUL.
      ffi.copy(torch.data(data:narrow(1, p, content[i]:len() + 1)),
               content[i])
      p = p + content[i]:len() + 1
    end
  end
  print('\rProcessed lines: '..n)
  fd:close()
  return {content = data, index = index}
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one csv line into a table of field strings. Quoted fields may
-- contain the separator, and a doubled quote ("") inside a quoted field
-- is unescaped to a single quote. `sep` defaults to ','.
function joe.parseCSVLine(line,sep)
  local res = {}
  local pos = 1
  sep = sep or ','
  while true do
    local c = string.sub(line,pos,pos)
    if (c == "") then break end
    if (c == '"') then
      -- quoted value (ignore separator within)
      local txt = ""
      repeat
        -- '%b""' matches a balanced pair of quotes starting at pos.
        local startp,endp = string.find(line,'^%b""',pos)
        txt = txt..string.sub(line,startp+1,endp-1)
        pos = endp + 1
        c = string.sub(line,pos,pos)
        if (c == '"') then txt = txt..'"' end
        -- check first char AFTER quoted string, if it is another
        -- quoted string without separator, then append it
        -- this is the way to "escape" the quote char in a quote.
      until (c ~= '"')
      table.insert(res,txt)
      -- Only a separator or end-of-line may follow a quoted field.
      assert(c == sep or c == "")
      pos = pos + 1
    else
      -- no quotes used, just look for the first separator
      local startp,endp = string.find(line,sep,pos)
      if (startp) then
        table.insert(res,string.sub(line,pos,startp-1))
        pos = endp + 1
      else
        -- no separator found -> use rest of string and terminate
        table.insert(res,string.sub(line,pos))
        break
      end
    end
  end
  return res
end
-- Run the script and expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/construct_tfidf.lua
================================================
--[[
Construct tfidf format from bag format
Copyright 2016 Xiang Zhang
Usage: th construct_tfidf.lua [input] [output] [list] [limit]
--]]
local io = require('io')
local math = require('math')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point. Optional arguments: input bag t7b, output tfidf t7b, the
-- frequency list csv, and the vocabulary limit (default 200000).
function joe.main()
  local input = arg[1] or '../data/dianping/train_charbag.t7b'
  local output = arg[2] or '../data/dianping/train_charbagtfidf.t7b'
  local list = arg[3] or '../data/dianping/train_charbag_list.csv'
  local limit = arg[4] and tonumber(arg[4]) or 200000
  print('Loading data from '..input)
  local data = torch.load(input)
  print('Loading frequency list from '..list)
  local freq = joe.readList(list)
  print('Frequency list length '..freq:size(1))
  print('Constructing bag-of-elements TFIDF data')
  local tfidf = joe.constructTfidf(data, freq, limit)
  print('Saving to '..output)
  torch.save(output, tfidf)
end
-- Read the frequency list csv and return the third column (document
-- frequency) of every row as a 1D tensor, in file order.
function joe.readList(list)
  local freq = {}
  local fd = io.open(list)
  for line in fd:lines() do
    local content = joe.parseCSVLine(line)
    -- Only the frequency column is used; the dead unescaping of
    -- column 2 from the original was removed.
    freq[#freq + 1] = tonumber(content[3])
  end
  -- Close the handle explicitly; the previous version leaked it.
  fd:close()
  return torch.Tensor(freq)
end
-- Reweight bag counts by inverse document frequency and re-normalize
-- per sample. freq holds document frequencies ordered by vocabulary
-- rank; it is truncated/padded to limit + 1 entries (the extra slot is
-- the out-of-vocabulary index, weighted 0).
-- NOTE(review): when no padding copy is made, freq:apply mutates the
-- tensor passed in by the caller — confirm callers do not reuse it.
function joe.constructTfidf(data, freq, limit)
  local bag, bag_index, bag_value = data.bag, data.bag_index, data.bag_value
  local tfidf_value = bag_value:clone()
  local freq = freq
  if freq:size(1) > limit then
    -- Zero entries beyond the vocabulary limit.
    freq:narrow(1, limit + 1, freq:size(1) - limit):zero()
  elseif freq:size(1) < limit + 1 then
    -- Pad with zeros up to (and including) the replace index.
    local new_freq = freq.new(limit + 1):zero()
    new_freq:narrow(1, 1, freq:size(1)):copy(freq)
    freq = new_freq
  end
  -- idf = log(1 / df); zero document frequency maps to zero weight.
  freq:apply(function (x) return x > 0 and math.log(1/x) or 0 end)
  local indexed = freq:index(1, bag_index)
  tfidf_value:cmul(indexed)
  -- Iterate through the classes
  for i = 1, #bag do
    print('Processing for class '..i)
    -- Iterate through the samples
    for j = 1, bag[i]:size(1) do
      if math.fmod(j, 10000) == 0 then
        io.write('\rProcessing sample: ', j, '/', bag[i]:size(1))
        io.flush()
      end
      -- L1-normalize each sample's tfidf weights when nonzero.
      if bag[i][j][2] > 0 and
        tfidf_value:narrow(1, bag[i][j][1], bag[i][j][2]):sum() ~= 0 then
        tfidf_value:narrow(1, bag[i][j][1], bag[i][j][2]):div(
          tfidf_value:narrow(1, bag[i][j][1], bag[i][j][2]):sum())
      end
    end
    print('\rProcessed samples: '..bag[i]:size(1)..'/'..bag[i]:size(1))
  end
  return {bag = bag, bag_index = bag_index, bag_value = tfidf_value}
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one csv line into a table of field strings. Quoted fields may
-- contain the separator, and a doubled quote ("") inside a quoted field
-- is unescaped to a single quote. `sep` defaults to ','.
function joe.parseCSVLine(line,sep)
  local res = {}
  local pos = 1
  sep = sep or ','
  while true do
    local c = string.sub(line,pos,pos)
    if (c == "") then break end
    if (c == '"') then
      -- quoted value (ignore separator within)
      local txt = ""
      repeat
        -- '%b""' matches a balanced pair of quotes starting at pos.
        local startp,endp = string.find(line,'^%b""',pos)
        txt = txt..string.sub(line,startp+1,endp-1)
        pos = endp + 1
        c = string.sub(line,pos,pos)
        if (c == '"') then txt = txt..'"' end
        -- check first char AFTER quoted string, if it is another
        -- quoted string without separator, then append it
        -- this is the way to "escape" the quote char in a quote.
      until (c ~= '"')
      table.insert(res,txt)
      -- Only a separator or end-of-line may follow a quoted field.
      assert(c == sep or c == "")
      pos = pos + 1
    else
      -- no quotes used, just look for the first separator
      local startp,endp = string.find(line,sep,pos)
      if (startp) then
        table.insert(res,string.sub(line,pos,startp-1))
        pos = endp + 1
      else
        -- no separator found -> use rest of string and terminate
        table.insert(res,string.sub(line,pos))
        break
      end
    end
  end
  return res
end
-- Run the script and expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/construct_word.lua
================================================
--[[
Create word serialization format from csv files
Copyright 2015-2016 Xiang Zhang
Usage: th construct_word.lua [input] [output]
--]]
local ffi = require('ffi')
local io = require('io')
local math = require('math')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point. Optional arguments: input word-id csv and output t7b.
-- Scans the csv once for sizing, then serializes the word ids.
function joe.main()
  local input = arg[1] or '../data/dianping/train_word.csv'
  local output = arg[2] or '../data/dianping/train_word.t7b'
  print('Counting samples')
  local count, length, fields = joe.countSamples(input)
  for i, v in ipairs(count) do
    print('Number of samples in class '..i..': '..v)
  end
  print('Total number of words: '..length)
  print('Number of text fields: '..fields)
  print('Constructing data')
  local data = joe.constructData(input, count, length, fields)
  print('Saving to '..output)
  torch.save(output, data)
end
-- Scan the csv once to gather sizing information. Returns per-class
-- sample counts, the total number of word ids, and the number of text
-- fields per row. Word ids are runs of digits; the second return value
-- of gsub is the number of matches, i.e. the word count of the field.
function joe.countSamples(input)
  local count = {}
  local length = 0
  local fields = nil
  local n = 0
  local fd = io.open(input)
  for line in fd:lines() do
    n = n + 1
    if math.fmod(n, 10000) == 0 then
      io.write('\rProcessing line: ', n)
      io.flush()
    end
    local content = joe.parseCSVLine(line)
    local class = tonumber(content[1])
    count[class] = count[class] and count[class] + 1 or 1
    for i = 2, #content do
      -- Unescape newlines and trim before counting.
      content[i] = content[i]:gsub('\\n', '\n'):gsub('^%s*(.-)%s*$', '%1')
      local _, current_length = content[i]:gsub('(%d+)', '%1')
      length = length + current_length
    end
    fields = fields or #content - 1
    if fields ~= #content - 1 then
      error('Number of fields is not '..fields..' at line '..n)
    end
  end
  print('\rProcessed lines: '..n)
  fd:close()
  return count, length, fields
end
-- Serialize the word-id csv into a flat LongTensor plus per-class index
-- tensors of (offset, length) pairs, mirroring the unicode code format.
-- input: csv of class,word-id sequences. count: per-class sample
-- counts. length: total number of word ids. fields: text fields per row.
function joe.constructData(input, count, length, fields)
  local data = torch.LongTensor(length)
  local index = {}
  for i, v in ipairs(count) do
    index[i] = torch.LongTensor(v, fields, 2)
  end
  -- Per-class cursor over the sample dimension.
  local progress = {}
  local n = 0
  -- Global write position into the word-id tensor.
  local p = 1
  local fd = io.open(input)
  for line in fd:lines() do
    n = n + 1
    if math.fmod(n, 10000) == 0 then
      io.write('\rProcessing line: ', n)
      io.flush()
    end
    local content = joe.parseCSVLine(line)
    local class = tonumber(content[1])
    progress[class] = progress[class] and progress[class] + 1 or 1
    for i = 2, #content do
      -- Unescape newlines and trim, matching the counting pass.
      content[i] = content[i]:gsub('\\n', '\n'):gsub('^%s*(.-)%s*$', '%1')
      index[class][progress[class]][i - 1][1] = p
      for word in content[i]:gmatch('%d+') do
        data[p] = tonumber(word)
        p = p + 1
      end
      -- Field length derives from the write pointer; the unused
      -- current_length accumulator from the original was removed.
      index[class][progress[class]][i - 1][2] =
        p - index[class][progress[class]][i - 1][1]
    end
  end
  print('\rProcessed lines: '..n)
  fd:close()
  return {code = index, code_value = data}
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one csv line into a table of field strings. Quoted fields may
-- contain the separator, and a doubled quote ("") inside a quoted field
-- is unescaped to a single quote. `sep` defaults to ','.
function joe.parseCSVLine(line,sep)
  local res = {}
  local pos = 1
  sep = sep or ','
  while true do
    local c = string.sub(line,pos,pos)
    if (c == "") then break end
    if (c == '"') then
      -- quoted value (ignore separator within)
      local txt = ""
      repeat
        -- '%b""' matches a balanced pair of quotes starting at pos.
        local startp,endp = string.find(line,'^%b""',pos)
        txt = txt..string.sub(line,startp+1,endp-1)
        pos = endp + 1
        c = string.sub(line,pos,pos)
        if (c == '"') then txt = txt..'"' end
        -- check first char AFTER quoted string, if it is another
        -- quoted string without separator, then append it
        -- this is the way to "escape" the quote char in a quote.
      until (c ~= '"')
      table.insert(res,txt)
      -- Only a separator or end-of-line may follow a quoted field.
      assert(c == sep or c == "")
      pos = pos + 1
    else
      -- no quotes used, just look for the first separator
      local startp,endp = string.find(line,sep,pos)
      if (startp) then
        table.insert(res,string.sub(line,pos,startp-1))
        pos = endp + 1
      else
        -- no separator found -> use rest of string and terminate
        table.insert(res,string.sub(line,pos))
        break
      end
    end
  end
  return res
end
-- Run the script and expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/construct_wordbag.lua
================================================
--[[
Construct word bag-of-element format
Copyright 2016 Xiang Zhang
Usage: th construct_wordbag.lua [input] [output] [limit] [replace]
--]]
local io = require('io')
local math = require('math')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point. Optional arguments: input word serialization t7b, output
-- bag t7b, the vocabulary limit (default 200000), and the replacement
-- id for out-of-vocabulary words (default 200001).
function joe.main()
  local input = arg[1] or '../data/dianping/train_word.t7b'
  local output = arg[2] or '../data/dianping/train_wordbag.t7b'
  local limit = arg[3] and tonumber(arg[3]) or 200000
  local replace = arg[4] and tonumber(arg[4]) or 200001
  print('Loading data from '..input)
  local data = torch.load(input)
  print('Counting words')
  local count = joe.countBag(data, limit, replace)
  print('Total number of values: '..count)
  print('Constructing word bag data')
  local bag = joe.constructBag(data, count, limit, replace)
  print('Saving to '..output)
  torch.save(output, bag)
end
-- Count the total number of (sample, distinct word) pairs over the
-- whole dataset, after mapping out-of-vocabulary ids (> limit) to the
-- replace id. The result sizes the flat tensors that joe.constructBag
-- fills in.
function joe.countBag(data, limit, replace)
  local code, code_value = data.code, data.code_value
  local count = 0
  -- Iterate through the classes
  for i = 1, #code do
    print('Processing for class '..i)
    -- Iterate through the samples
    for j = 1, code[i]:size(1) do
      if math.fmod(j, 1000) == 0 then
        io.write('\rProcessing text: ', j, '/', code[i]:size(1))
        io.flush()
      end
      local seen = {}
      -- Iterate through the fields of this sample
      for k = 1, code[i][j]:size(1) do
        local offset, len = code[i][j][k][1], code[i][j][k][2]
        for l = 0, len - 1 do
          local word = code_value[offset + l]
          if word > limit then word = replace end
          -- Only the first occurrence per sample contributes.
          if seen[word] == nil then
            seen[word] = true
            count = count + 1
          end
        end
      end
    end
    print('\rProcessed texts: '..code[i]:size(1)..'/'..code[i]:size(1))
  end
  return count
end
-- Build the bag-of-words representation.
-- Returns a table with:
--   bag: per-class LongTensor of size (samples, 2) holding each
--        sample's {start offset, length} into bag_index/bag_value.
--   bag_index: LongTensor of sorted unique word codes per sample.
--   bag_value: DoubleTensor of word frequencies, L1-normalized per
--              sample; entries for codes above `limit` are set to 0.
-- `count` must be the value returned by joe.countBag on the same data.
function joe.constructBag(data, count, limit, replace)
local code, code_value = data.code, data.code_value
local bag = {}
local bag_index = torch.LongTensor(count)
local bag_value = torch.DoubleTensor(count)
-- Shadows the argument: reused as the running write position.
local count = 0
-- Iterate through the classes
for i = 1, #code do
print('Processing for class '..i)
bag[i] = torch.LongTensor(code[i]:size(1), 2)
-- Iterate through the samples
for j = 1, code[i]:size(1) do
if math.fmod(j, 1000) == 0 then
io.write('\rProcessing text: ', j, '/', code[i]:size(1))
io.flush()
end
-- index: word -> count; pointer: unique words in insertion order.
local index = {}
local pointer = {}
bag[i][j][1] = count + 1
-- Iterate through the fields
for k = 1, code[i][j]:size(1) do
for l = 1, code[i][j][k][2] do
local word = code_value[code[i][j][k][1] + l - 1]
if word > limit then
word = replace
end
if not index[word] then
count = count + 1
index[word] = 1
pointer[#pointer + 1] = word
else
index[word] = index[word] + 1
end
end
end
-- Store the unique words in ascending code order.
table.sort(pointer)
bag[i][j][2] = #pointer
for m = 1, #pointer do
bag_index[bag[i][j][1] + m - 1] = pointer[m]
-- NOTE(review): after the replace-mapping above, pointer[m] can only
-- exceed limit when replace > limit; the replacement word then gets
-- weight 0 and is effectively excluded from the normalization mass.
if pointer[m] > limit then
bag_value[bag[i][j][1] + m - 1] = 0
else
bag_value[bag[i][j][1] + m - 1] = index[pointer[m]]
end
end
-- Normalize counts to frequencies; skip empty or all-zero slices.
if #pointer > 0 and
bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):sum() ~= 0 then
bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):div(
bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):sum())
end
end
print('\rProcessed texts: '..code[i]:size(1)..'/'..code[i]:size(1))
end
return {bag = bag, bag_index = bag_index, bag_value = bag_value}
end
-- Execute the program immediately, then expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/construct_wordgram.lua
================================================
--[[
Constructngrams format from serialization
Copyright 2016 Xiang Zhang
Usage: th construct_wordgram.lua [input] [output] [list] [gram] [limit]
[replace]
--]]
local io = require('io')
local math = require('math')
local tds = require('tds')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: load the word codes and the ngram frequency list, then
-- build and save the normalized bag-of-ngrams representation.
-- Optional arguments: input, output, list file, max gram length,
-- vocabulary limit, and the replacement index for rare/unknown ngrams.
function joe.main()
   local input = arg[1] or '../data/dianping/train_word.t7b'
   local output = arg[2] or '../data/dianping/train_wordgram.t7b'
   local list = arg[3] or '../data/dianping/train_wordgram_list.csv'
   local gram = tonumber(arg[4]) or 5
   local limit = tonumber(arg[5]) or 1000000
   local replace = tonumber(arg[6]) or 1000001

   print('Loading data from '..input)
   local data = torch.load(input)

   print('Reading frequency from '..list)
   local freq, dict = joe.readList(list)

   print('Counting character ngrams data')
   local count = joe.countBag(data, dict, gram, limit, replace)
   print('Total number of ngrams in data is '..count)

   print('Constructing character bag data')
   local bag = joe.constructBag(data, dict, count, gram, limit, replace)

   print('Saving to '..output)
   torch.save(output, bag)
end
-- Read the ngram frequency list from a CSV file.
-- Each line is expected as "ngram_code","ngram_string","frequency".
-- Returns:
--   freq: torch.Tensor of frequencies in file order.
--   dict: tds.Hash mapping the ngram code string -> its 1-based rank.
function joe.readList(list)
   local freq_table = tds.Vec()
   local dict = tds.Hash()
   -- Fail with a clear message if the list file cannot be opened.
   local fd = assert(io.open(list), 'cannot open list file: '..list)
   for line in fd:lines() do
      local content = joe.parseCSVLine(line)
      content[2] = content[2]:gsub('\\n', '\n')
      freq_table[#freq_table + 1] = tonumber(content[3])
      dict[content[1]] = #freq_table
   end
   -- Close the descriptor (it was previously leaked).
   fd:close()
   local freq = torch.Tensor(#freq_table)
   -- Numeric indexing instead of ipairs: iterating a tds.Vec with
   -- ipairs requires LuaJIT's Lua 5.2 compatibility, which is not
   -- guaranteed; direct indexing always works.
   for i = 1, #freq_table do
      freq[i] = freq_table[i]
   end
   return freq, dict
end
-- Count the total number of (sample, unique ngram) pairs in the data.
-- For every field of every sample, all ngrams of length 1..gram are
-- formed by joining consecutive word codes with spaces and looked up
-- in `dict`; ngrams that are unknown or ranked above `limit` collapse
-- to the single index `replace`.
function joe.countBag(data, dict, gram, limit, replace)
local count = 0
local code, code_value = data.code, data.code_value
-- Iterate through the classes
for i = 1, #code do
print('Processing for class '..i)
-- Iterate through the samples
for j = 1, code[i]:size(1) do
-- Progress report and GC every 1000 samples.
if math.fmod(j, 1000) == 0 then
io.write('\rProcessing text: ', j, '/', code[i]:size(1))
io.flush()
collectgarbage()
end
-- Per-sample table: ngram index -> occurrence count.
local index = {}
-- Iterate through the fields
for k = 1, code[i][j]:size(1) do
-- Iterate through the grams
for n = 1, gram do
-- Iterate through the positions
for l = 1, code[i][j][k][2] - n + 1 do
-- Build the space-separated code string for this ngram.
local ngram = tostring(code_value[code[i][j][k][1] + l - 1])
for m = 2, n do
ngram = ngram..' '..tostring(
code_value[code[i][j][k][1] + l - 1 + m - 1])
end
local ngram_index = dict[ngram]
if ngram_index == nil or ngram_index > limit then
ngram_index = replace
end
if not index[ngram_index] then
index[ngram_index] = 0
-- First occurrence of this ngram in this sample: one bag entry.
count = count + 1
end
index[ngram_index] = index[ngram_index] + 1
end
end
end
end
print('\rProcessed texts: '..code[i]:size(1)..'/'..code[i]:size(1))
end
return count
end
-- Build the bag-of-ngrams representation.
-- Returns a table with:
--   bag: per-class LongTensor of size (samples, 2) holding each
--        sample's {start offset, length} into bag_index/bag_value.
--   bag_index: LongTensor of sorted unique ngram indices per sample.
--   bag_value: DoubleTensor of ngram frequencies, L1-normalized per
--              sample; entries for indices above `limit` are set to 0.
-- `count` must be the value returned by joe.countBag on the same data.
function joe.constructBag(data, dict, count, gram, limit, replace)
local code, code_value = data.code, data.code_value
local bag = {}
local bag_index = torch.LongTensor(count)
local bag_value = torch.DoubleTensor(count)
-- Shadows the argument: reused as the running write position.
local count = 0
-- Iterate through the classes
for i = 1, #code do
print('Processing for class '..i)
bag[i] = torch.LongTensor(code[i]:size(1), 2)
-- Iterate through the samples
for j = 1, code[i]:size(1) do
if math.fmod(j, 1000) == 0 then
io.write('\rProcessing text: ', j, '/', code[i]:size(1))
io.flush()
collectgarbage()
end
-- index: ngram -> count; pointer: unique ngrams in insertion order.
local index = {}
local pointer = {}
bag[i][j][1] = count + 1
-- Iterate through the fields
for k = 1, code[i][j]:size(1) do
-- Iterate through the grams
for n = 1, gram do
-- Iterate through the positions
for l = 1, code[i][j][k][2] - n + 1 do
-- Build the space-separated code string for this ngram.
local ngram = tostring(code_value[code[i][j][k][1] + l - 1])
for m = 2, n do
ngram = ngram..' '..tostring(
code_value[code[i][j][k][1] + l - 1 + m - 1])
end
local ngram_index = dict[ngram]
if ngram_index == nil or ngram_index > limit then
ngram_index = replace
end
if not index[ngram_index] then
count = count + 1
index[ngram_index] = 0
pointer[#pointer + 1] = ngram_index
end
index[ngram_index] = index[ngram_index] + 1
end
end
end
-- Store the unique ngram indices in ascending order.
table.sort(pointer)
bag[i][j][2] = #pointer
for m = 1, #pointer do
bag_index[bag[i][j][1] + m - 1] = pointer[m]
-- Only the `replace` index can exceed limit here; it gets weight 0.
if pointer[m] > limit then
bag_value[bag[i][j][1] + m - 1] = 0
else
bag_value[bag[i][j][1] + m - 1] = index[pointer[m]]
end
end
-- Normalize counts to frequencies; skip empty or all-zero slices.
if #pointer > 0 and
bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):sum() ~= 0 then
bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):div(
bag_value:narrow(1, bag[i][j][1], bag[i][j][2]):sum())
end
end
print('\rProcessed texts: '..code[i]:size(1)..'/'..code[i]:size(1))
end
return {bag = bag, bag_index = bag_index, bag_value = bag_value}
end
-- UTF-8 byte markers: {max code point, leading-byte base} for the 2-,
-- 3- and 4-byte encodings.
joe.bytemarkers = {{0x7FF, 192}, {0xFFFF, 224}, {0x1FFFFF, 240}}
-- Encode a Unicode code point as a UTF-8 string.
function joe.utf8str(decimal)
   -- ASCII maps to a single byte.
   if decimal < 128 then return string.char(decimal) end
   local markers = joe.bytemarkers
   local out = {}
   for nbytes, marker in ipairs(markers) do
      if decimal <= marker[1] then
         -- Fill the continuation bytes from the low 6-bit groups.
         for slot = nbytes + 1, 2, -1 do
            local low = decimal % 64
            decimal = (decimal - low) / 64
            out[slot] = string.char(128 + low)
         end
         -- The leading byte carries the remaining high bits.
         out[1] = string.char(marker[2] + decimal)
         break
      end
   end
   return table.concat(out)
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one CSV line into an array table of field strings.
-- line: the raw CSV line; sep: field separator (defaults to ','),
-- matched literally so pattern magic characters work as separators.
-- Raises a descriptive error on an unterminated quoted field instead
-- of crashing with "attempt to perform arithmetic on a nil value".
function joe.parseCSVLine(line,sep)
   local res = {}
   local pos = 1
   sep = sep or ','
   while true do
      local c = string.sub(line,pos,pos)
      if (c == "") then break end
      if (c == '"') then
         -- quoted value (ignore separator within)
         local txt = ""
         repeat
            local startp,endp = string.find(line,'^%b""',pos)
            -- Guard against a quoted field with no closing quote.
            assert(startp, 'unterminated quote in CSV line: '..line)
            txt = txt..string.sub(line,startp+1,endp-1)
            pos = endp + 1
            c = string.sub(line,pos,pos)
            if (c == '"') then txt = txt..'"' end
            -- check first char AFTER quoted string, if it is another
            -- quoted string without separator, then append it
            -- this is the way to "escape" the quote char in a quote.
         until (c ~= '"')
         table.insert(res,txt)
         assert(c == sep or c == "")
         pos = pos + 1
      else
         -- no quotes used, just look for the first separator
         -- (plain find: treat sep as literal text, not a Lua pattern)
         local startp,endp = string.find(line,sep,pos,true)
         if (startp) then
            table.insert(res,string.sub(line,pos,startp-1))
            pos = endp + 1
         else
            -- no separator found -> use rest of string and terminate
            table.insert(res,string.sub(line,pos))
            break
         end
      end
   end
   return res
end
-- Execute the program immediately, then expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/construct_wordtoken.lua
================================================
--[[
Construct word token format from csv files
Copyright 2017 Xiang Zhang
Usage: th construct_wordtoken [input] [list] [output]
--]]
local io = require('io')
local math = require('math')
local tds = require('tds')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: read the word list, then convert the CSV samples into
-- fastText-style token lines.
function joe.main()
   local input = arg[1] or '../data/dianping/train_word.csv'
   local list = arg[2] or '../data/dianping/train_word_list.csv'
   local output = arg[3] or '../data/dianping/train_wordtoken.txt'

   print('Reading list from '..list)
   local vocabulary = joe.readList(list)

   print('Constructing word token')
   joe.constructToken(input, output, vocabulary)
end
-- Load the word list CSV and return a tds.Vec of cleaned word strings,
-- indexed by their 1-based line number.
function joe.readList(list)
   local word_list = tds.Vec()
   local fd = io.open(list)
   local line_count = 0
   for line in fd:lines() do
      line_count = line_count + 1
      if math.fmod(line_count, 10000) == 0 then
         io.write('\rProcessing line: ', line_count)
         io.flush()
      end
      local fields = joe.parseCSVLine(line)
      -- Unescape newlines, squash control characters to spaces, and
      -- trim surrounding whitespace.
      local word = fields[1]:gsub('\\n', '\n')
      word = word:gsub('[%z\001-\032\127]', ' ')
      word = word:gsub('^%s*(.-)%s*$', '%1')
      word_list[#word_list + 1] = word
   end
   print('\rProcessed lines: '..line_count)
   fd:close()
   return word_list
end
-- Convert each CSV sample to one output line of the form
-- "__label__<class> word word ...", resolving word indices through
-- word_list and emitting '<unk>' for indices outside the list.
function joe.constructToken(input, output, word_list)
   local ifd = io.open(input)
   local ofd = io.open(output, 'w')
   local processed = 0
   for line in ifd:lines() do
      processed = processed + 1
      if math.fmod(processed, 10000) == 0 then
         io.write('\rProcessing line: ', processed)
         io.flush()
      end
      local fields = joe.parseCSVLine(line)
      -- fastText-style label prefix, then the resolved tokens.
      ofd:write('__label__', tonumber(fields[1]))
      for i = 2, #fields do
         local text = fields[i]:gsub('\\n', '\n'):gsub('^%s*(.-)%s*$', '%1')
         for index in text:gmatch('%d+') do
            ofd:write(' ', word_list[tonumber(index)] or '<unk>')
         end
      end
      ofd:write('\n')
   end
   print('\rProcessed lines: '..processed)
   ifd:close()
   ofd:close()
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one CSV line into an array table of field strings.
-- line: the raw CSV line; sep: field separator (defaults to ','),
-- matched literally so pattern magic characters work as separators.
-- Raises a descriptive error on an unterminated quoted field instead
-- of crashing with "attempt to perform arithmetic on a nil value".
function joe.parseCSVLine(line,sep)
   local res = {}
   local pos = 1
   sep = sep or ','
   while true do
      local c = string.sub(line,pos,pos)
      if (c == "") then break end
      if (c == '"') then
         -- quoted value (ignore separator within)
         local txt = ""
         repeat
            local startp,endp = string.find(line,'^%b""',pos)
            -- Guard against a quoted field with no closing quote.
            assert(startp, 'unterminated quote in CSV line: '..line)
            txt = txt..string.sub(line,startp+1,endp-1)
            pos = endp + 1
            c = string.sub(line,pos,pos)
            if (c == '"') then txt = txt..'"' end
            -- check first char AFTER quoted string, if it is another
            -- quoted string without separator, then append it
            -- this is the way to "escape" the quote char in a quote.
         until (c ~= '"')
         table.insert(res,txt)
         assert(c == sep or c == "")
         pos = pos + 1
      else
         -- no quotes used, just look for the first separator
         -- (plain find: treat sep as literal text, not a Lua pattern)
         local startp,endp = string.find(line,sep,pos,true)
         if (startp) then
            table.insert(res,string.sub(line,pos,startp-1))
            pos = endp + 1
         else
            -- no separator found -> use rest of string and terminate
            table.insert(res,string.sub(line,pos))
            break
         end
      end
   end
   return res
end
-- Execute the program immediately, then expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/convert_string_code.lua
================================================
--[[
Convert string serialization to code
Copyright 2016 Xiang Zhang
Usage: th convert_string_code.lua [input] [output]
--]]
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: load the string serialization and re-save it under the
-- code/code_value field names used by the downstream tools.
function joe.main()
   local input = arg[1] or '../data/dianping/train_string.t7b'
   local output = arg[2] or '../data/dianping/train_string_code.t7b'
   print('Reading from '..input)
   local loaded = torch.load(input)
   print('Converting to code format')
   local converted = joe.convert(loaded)
   print('Saving to '..output)
   torch.save(output, converted)
end
-- Rename fields: the string serialization's index/content pair becomes
-- the code/code_value pair expected by the code-format consumers.
function joe.convert(input_data)
   return {
      code = input_data.index,
      code_value = input_data.content,
   }
end
-- Execute the program immediately, then expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/count_chargram.lua
================================================
--[[
Parallelized chargram counting program
Copyright Xiang Zhang 2016
Usage: th count_chargram.lua [input] [output_prefix] [grams] [chunks] [threads]
[batch] [buffer]
Comment: This program is a map-reduce like process. During map, each sample is
separated into character-ngrams. During reduce, these character-ngrams are
aggregated per-batch samples and output to file chunks. Which files chunk to
put the gram is determined by a hash value of the gram string, therefore
instances of the same gram always end up in the same file. This program is
necessary because a linear aggregation program can easily overflow memory for
several millions of samples.
--]]
local hash = require('hash')
local io = require('io')
local math = require('math')
local tds = require('tds')
local threads = require('threads')
local torch = require('torch')
local Queue = require('queue')
-- Library configurations
threads.serialization('threads.sharedserialize')
-- A Logic Named Joe
local joe = {}
-- Constant values
joe.SEED = 0
-- Main program entry
-- Orchestrates the map-reduce style counting: loads the code data,
-- opens `chunks` output CSV files, spawns worker threads that emit
-- per-sample ngram counts through a bounded queue, aggregates them in
-- joe.record, and flushes the record to disk every `batch` samples.
function joe.main()
local input = arg[1] or '../data/dianping/train_code.t7b'
local output_prefix = arg[2] or '../data/dianping/train_chargram_count/'
local num_grams = arg[3] and tonumber(arg[3]) or 5
local chunks = arg[4] and tonumber(arg[4]) or 100
local num_threads = arg[5] and tonumber(arg[5]) or 10
local batch = arg[6] and tonumber(arg[6]) or 100000
local buffer = arg[7] and tonumber(arg[7]) or 1000
print('Loading data from '..input)
local data = torch.load(input)
print('Opening output files with prefix '..output_prefix)
local fds = {}
for i = 1, chunks do
fds[i] = io.open(output_prefix..tostring(i)..'.csv', 'w')
end
joe.fds = fds
print('Setting finished threads to 0')
joe.finished = 0
print('Creating record')
joe.record = tds.Hash()
print('Setting item counter to 0')
joe.count = 0
print('Storing options')
joe.batch = batch
print('Creating queues')
local queue = Queue(buffer)
print('Creating mutex')
local mutex = threads.Mutex()
print('Creating '..num_threads..' threads')
local init_thread = joe.initThread()
local block = threads.Threads(num_threads, init_thread)
block:specific(true)
print('Deploying thread jobs')
joe.deployThreads(data, num_grams, queue, mutex, block, num_threads)
print('Entering main thread loop')
-- Dispatch RPC messages from workers until every thread has exited.
while joe.finished < num_threads do
local rpc = queue:pop()
joe[rpc.func](unpack(rpc.arg))
end
-- Flush the final, partially-filled batch if there is one.
if math.fmod(joe.count, batch) ~= 0 then
print('Writing records to files at '..joe.count)
joe.writeRecord()
end
print('Destroying mutex')
mutex:free()
print('Closing files')
for _, fd in ipairs(fds) do
fd:close()
end
print('Synchronizing and terminating the threads')
block:synchronize()
block:terminate()
end
-- Thread initialization callback
-- Returns the per-thread initialization callback, which preloads the
-- modules each worker needs before its job runs.
function joe.initThread()
   return function ()
      require('torch')
      require('queue')
   end
end
-- Thread job deploying threads
-- Thread job deploying threads
-- Adds one job per worker. `progress` is a shared LongTensor holding
-- the {class, item} cursor that workers advance under the mutex.
-- While waiting for each worker's 'notifyDeploy' handshake, incoming
-- RPCs are serviced so the bounded queue cannot fill up and deadlock.
function joe.deployThreads(data, num_grams, queue, mutex, block, num_threads)
local progress = torch.LongTensor(2)
progress[1] = 1
progress[2] = 0
for i = 1, num_threads do
print('Deploying job for thread '..i)
local thread_job = joe.threadJob(
data, num_grams, queue, mutex:id(), progress, i)
block:addjob(i, thread_job)
local rpc = queue:pop()
while rpc.func ~= 'notifyDeploy' do
joe[rpc.func](unpack(rpc.arg))
rpc = queue:pop()
end
print('rpc = notifyDeploy, thread = '..rpc.arg[1])
end
end
-- Write records to file
-- Write records to file
-- Flushes joe.record to the chunk files. The chunk for each gram is
-- chosen by hashing its code string, so every instance of the same
-- gram always lands in the same file. Output rows are CSV:
-- "code","gram string","term count","doc count".
function joe.writeRecord()
for code, item in pairs(joe.record) do
local chunk = hash.hash(code, joe.SEED, #joe.fds) + 1
joe.fds[chunk]:write(
'"', code, '","', item[1]:gsub('\n', '\\n'):gsub('"', '""'), '","',
item[2], '","', item[3], '"\n')
end
-- Start a fresh accumulation table and reclaim its memory.
joe.record = tds.Hash()
collectgarbage()
end
-- Thread job
-- Thread job
-- Builds the closure executed by each worker thread. Workers pull
-- samples via the shared `progress` cursor (guarded by the mutex),
-- count all character ngrams of length 1..num_grams in each sample,
-- and push the results to the main thread through `queue`.
function joe.threadJob(data, num_grams, queue, mutex_id, progress, thread_id)
local utf8str = joe.utf8str()
return function()
local math = require('math')
local string = require('string')
local threads = require('threads')
local mutex = threads.Mutex(mutex_id)
-- Notify the deployment
queue:push{func = 'notifyDeploy', arg = {__threadid}}
local code, code_value = data.code, data.code_value
local class, item
-- Obtain next sample
-- Advances the shared {class, item} cursor atomically. When the class
-- index runs past the data, `class` ends up pointing at a nil entry
-- and the worker loop below terminates.
local function nextSample()
mutex:lock()
if code[progress[1]] == nil then
class = progress[1]
item = progress[2]
elseif code[progress[1]]:size(1) < progress[2] + 1 then
-- Current class exhausted: move to the first sample of the next one.
progress[1] = progress[1] + 1
progress[2] = 1
class = progress[1]
item = progress[2]
else
progress[2] = progress[2] + 1
class = progress[1]
item = progress[2]
end
mutex:unlock()
end
local n = 0
nextSample()
while code[class] ~= nil do
n = n + 1
-- Report progress to the main thread every 100 samples.
if math.fmod(n, 100) == 0 then
queue:push{
func = 'print',
arg = {__threadid,
'Processing class '..class..', item '..item..
', total '..n}}
collectgarbage()
end
-- term_count: occurrences of each ngram in this sample;
-- doc_count: fixed at 1 per ngram (document-frequency contribution).
local term_count, doc_count = {}, {}
-- Iterate through the fields
for i = 1, code[class][item]:size(1) do
-- Iterate through the grams
for j = 1, num_grams do
-- Iterate through the positions
for k = 1, code[class][item][i][2] - j + 1 do
-- Space-separated string of j consecutive code values.
local code_string = tostring(
code_value[code[class][item][i][1] + k - 1])
for l = 2, j do
code_string = code_string..' '..tostring(
code_value[code[class][item][i][1] + k - 1 + l - 1])
end
if not term_count[code_string] then
term_count[code_string] = 1
doc_count[code_string] = 1
else
term_count[code_string] = term_count[code_string] + 1
end
end
end
end
-- Compress record to data
local items = {}
for code_string, _ in pairs(term_count) do
-- Decode the gram into readable UTF-8. NOTE(review): codes appear to
-- be character code points offset by 1, with codes <= 32 (except 11)
-- and codes > 65536 rendered as spaces -- confirm against the encoder.
local gram_string = ''
for value in code_string:gmatch('[%S]+') do
local value = tonumber(value)
gram_string = gram_string..
((value <= 65536 and (value > 32 or value == 11)) and
utf8str(value - 1) or ' ')
end
items[#items + 1] = {
code_string, gram_string, term_count[code_string],
doc_count[code_string]}
end
-- Send data to record
queue:push{func = 'recordItem', arg = {__threadid, items}}
nextSample()
end
-- Notify main thread that this thread has ended
queue:push{func = 'notifyExit', arg = {__threadid}}
end
end
-- Record item
-- Record item
-- Merges one sample's ngram items into joe.record. Each item is
-- {code string, gram string, term count, doc count}; counts are
-- accumulated for grams already present. Every joe.batch samples the
-- record is flushed to the chunk files.
function joe.recordItem(thread_id, items)
for _, item in pairs(items) do
if joe.record[item[1]] then
joe.record[item[1]][2] = joe.record[item[1]][2] + item[3]
joe.record[item[1]][3] = joe.record[item[1]][3] + item[4]
else
joe.record[item[1]] = tds.Vec{item[2], item[3], item[4]}
end
end
-- joe.count tracks processed samples, not distinct grams.
joe.count = joe.count + 1
-- Check write
if math.fmod(joe.count, joe.batch) == 0 then
print('Writing records to files at '..joe.count)
joe.writeRecord()
end
end
-- Print information
-- Log a progress message forwarded from a worker thread.
function joe.print(thread_id, message)
   local line = 'rpc = print, thread = '..thread_id
   print(line..', message = '..message)
end
-- Notify exit
-- Count a worker's exit notification; main loop stops when all exit.
function joe.notifyExit(thread_id)
   joe.finished = joe.finished + 1
   local line = 'rpc = notifyExit, thread = '..thread_id
   print(line..', finished = '..joe.finished)
end
-- UTF-8 encoding function
-- Ref: http://stackoverflow.com/questions/7983574/how-to-write-a-unicode-symbol
-- -in-lua
-- Factory returning a UTF-8 encoder closure (kept as a factory so the
-- byte-marker table is captured as an upvalue inside worker threads).
function joe.utf8str()
   -- {max code point, leading-byte base} for 2-, 3- and 4-byte forms.
   local bytemarkers = {{0x7FF, 192}, {0xFFFF, 224}, {0x1FFFFF, 240}}
   return function (decimal)
      local string = require('string')
      -- ASCII maps to a single byte.
      if decimal < 128 then return string.char(decimal) end
      local out = {}
      for nbytes, marker in ipairs(bytemarkers) do
         if decimal <= marker[1] then
            -- Fill the continuation bytes from the low 6-bit groups.
            for slot = nbytes + 1, 2, -1 do
               local low = decimal % 64
               decimal = (decimal - low) / 64
               out[slot] = string.char(128 + low)
            end
            -- The leading byte carries the remaining high bits.
            out[1] = string.char(marker[2] + decimal)
            break
         end
      end
      return table.concat(out)
   end
end
-- Execute the program immediately, then expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/count_wordgram.lua
================================================
--[[
Parallelized wordgram counting program
Copyright Xiang Zhang 2016
Usage: th count_wordgram.lua [input] [output_prefix] [list] [grams] [chunks]
[threads] [batch] [buffer]
Comment: This program is a map-reduce like process. During map, each sample is
separated into character-ngrams. During reduce, these character-ngrams are
aggregated per-batch samples and output to file chunks. Which files chunk to
put the gram is determined by a hash value of the gram string, therefore
instances of the same gram always end up in the same file. This program is
necessary because a linear aggregation program can easily overflow memory for
several millions of samples.
--]]
local hash = require('hash')
local io = require('io')
local math = require('math')
local tds = require('tds')
local threads = require('threads')
local torch = require('torch')
local Queue = require('queue')
-- Library configurations
threads.serialization('threads.sharedserialize')
-- A Logic Named Joe
local joe = {}
-- Constant values
joe.SEED = 0
-- Main program entry
-- Orchestrates the map-reduce style counting for word ngrams: loads
-- the word codes and word list, opens `chunks` output CSV files,
-- spawns worker threads that emit per-sample ngram counts through a
-- bounded queue, aggregates them in joe.record, and flushes to disk
-- every `batch` samples.
function joe.main()
local input = arg[1] or '../data/dianping/train_word.t7b'
local output_prefix = arg[2] or '../data/dianping/train_wordgram_count/'
local list = arg[3] or '../data/dianping/train_word_list.csv'
local num_grams = arg[4] and tonumber(arg[4]) or 5
local chunks = arg[5] and tonumber(arg[5]) or 100
local num_threads = arg[6] and tonumber(arg[6]) or 10
local batch = arg[7] and tonumber(arg[7]) or 100000
local buffer = arg[8] and tonumber(arg[8]) or 1000
print('Loading data from '..input)
local data = torch.load(input)
print('Loading list from '..list)
local freq, word_list = joe.readList(list)
print('Opening output files with prefix '..output_prefix)
local fds = {}
for i = 1, chunks do
fds[i] = io.open(output_prefix..tostring(i)..'.csv', 'w')
end
joe.fds = fds
print('Setting finished threads to 0')
joe.finished = 0
print('Creating record')
joe.record = tds.Hash()
print('Setting item counter to 0')
joe.count = 0
print('Storing options')
joe.batch = batch
print('Creating queues')
local queue = Queue(buffer)
print('Creating mutex')
local mutex = threads.Mutex()
print('Creating '..num_threads..' threads')
local init_thread = joe.initThread()
local block = threads.Threads(num_threads, init_thread)
block:specific(true)
print('Deploying thread jobs')
joe.deployThreads(
data, word_list, num_grams, queue, mutex, block, num_threads)
print('Entering main thread loop')
-- Dispatch RPC messages from workers until every thread has exited.
while joe.finished < num_threads do
local rpc = queue:pop()
joe[rpc.func](unpack(rpc.arg))
end
-- Flush the final, partially-filled batch if there is one.
if math.fmod(joe.count, batch) ~= 0 then
print('Writing records to files at '..joe.count)
joe.writeRecord()
end
print('Destroying mutex')
mutex:free()
print('Closing files')
for _, fd in ipairs(fds) do
fd:close()
end
print('Synchronizing and terminating the threads')
block:synchronize()
block:terminate()
end
-- Thread initialization callback
-- Returns the per-thread initialization callback, which preloads the
-- modules each worker needs before its job runs.
function joe.initThread()
   return function ()
      require('torch')
      require('queue')
   end
end
-- Thread job deploying threads
-- Thread job deploying threads
-- Adds one job per worker. `progress` is a shared LongTensor holding
-- the {class, item} cursor that workers advance under the mutex.
-- While waiting for each worker's 'notifyDeploy' handshake, incoming
-- RPCs are serviced so the bounded queue cannot fill up and deadlock.
function joe.deployThreads(
data, word_list, num_grams, queue, mutex, block, num_threads)
local progress = torch.LongTensor(2)
progress[1] = 1
progress[2] = 0
for i = 1, num_threads do
print('Deploying job for thread '..i)
local thread_job = joe.threadJob(
data, word_list, num_grams, queue, mutex:id(), progress, i)
block:addjob(i, thread_job)
local rpc = queue:pop()
while rpc.func ~= 'notifyDeploy' do
joe[rpc.func](unpack(rpc.arg))
rpc = queue:pop()
end
print('rpc = notifyDeploy, thread = '..rpc.arg[1])
end
end
-- Write records to file
-- Write records to file
-- Flushes joe.record to the chunk files. The chunk for each gram is
-- chosen by hashing its code string, so every instance of the same
-- gram always lands in the same file. Output rows are CSV:
-- "code","gram string","term count","doc count".
function joe.writeRecord()
for code, item in pairs(joe.record) do
local chunk = hash.hash(code, joe.SEED, #joe.fds) + 1
joe.fds[chunk]:write(
'"', code, '","', item[1]:gsub('\n', '\\n'):gsub('"', '""'), '","',
item[2], '","', item[3], '"\n')
end
-- Start a fresh accumulation table and reclaim its memory.
joe.record = tds.Hash()
collectgarbage()
end
-- Thread job
-- Thread job
-- Builds the closure executed by each worker thread. Workers pull
-- samples via the shared `progress` cursor (guarded by the mutex),
-- count all word ngrams of length 1..num_grams in each sample, and
-- push the results to the main thread through `queue`.
function joe.threadJob(
data, word_list, num_grams, queue, mutex_id, progress, thread_id)
local utf8str = joe.utf8str()
return function()
local math = require('math')
local string = require('string')
local threads = require('threads')
local mutex = threads.Mutex(mutex_id)
-- Notify the deployment
queue:push{func = 'notifyDeploy', arg = {__threadid}}
local code, code_value = data.code, data.code_value
local class, item
-- Obtain next sample
-- Advances the shared {class, item} cursor atomically. When the class
-- index runs past the data, `class` ends up pointing at a nil entry
-- and the worker loop below terminates.
local function nextSample()
mutex:lock()
if code[progress[1]] == nil then
class = progress[1]
item = progress[2]
elseif code[progress[1]]:size(1) < progress[2] + 1 then
-- Current class exhausted: move to the first sample of the next one.
progress[1] = progress[1] + 1
progress[2] = 1
class = progress[1]
item = progress[2]
else
progress[2] = progress[2] + 1
class = progress[1]
item = progress[2]
end
mutex:unlock()
end
local n = 0
nextSample()
while code[class] ~= nil do
n = n + 1
-- Report progress to the main thread every 100 samples.
if math.fmod(n, 100) == 0 then
queue:push{
func = 'print',
arg = {__threadid,
'Processing class '..class..', item '..item..
', total '..n}}
collectgarbage()
end
-- term_count: occurrences of each ngram in this sample;
-- doc_count: fixed at 1 per ngram (document-frequency contribution).
local term_count, doc_count = {}, {}
-- Iterate through the fields
for i = 1, code[class][item]:size(1) do
-- Iterate through the grams
for j = 1, num_grams do
-- Iterate through the positions
for k = 1, code[class][item][i][2] - j + 1 do
-- Space-separated string of j consecutive code values.
local code_string = tostring(
code_value[code[class][item][i][1] + k - 1])
for l = 2, j do
code_string = code_string..' '..tostring(
code_value[code[class][item][i][1] + k - 1 + l - 1])
end
if not term_count[code_string] then
term_count[code_string] = 1
doc_count[code_string] = 1
else
term_count[code_string] = term_count[code_string] + 1
end
end
end
end
-- Compress record to data
local items = {}
for code_string, _ in pairs(term_count) do
-- Resolve word codes to their strings. NOTE(review): this produces a
-- leading space before the first word, and unknown codes become empty
-- strings -- downstream consumers appear to tolerate both; confirm.
local gram_string = ''
for value in code_string:gmatch('[%S]+') do
local value = tonumber(value)
gram_string = gram_string..' '..(word_list[value] or '')
end
items[#items + 1] = {
code_string, gram_string, term_count[code_string],
doc_count[code_string]}
end
-- Send data to record
queue:push{func = 'recordItem', arg = {__threadid, items}}
nextSample()
end
-- Notify main thread that this thread has ended
queue:push{func = 'notifyExit', arg = {__threadid}}
end
end
-- Record item
-- Record item
-- Merges one sample's ngram items into joe.record. Each item is
-- {code string, gram string, term count, doc count}; counts are
-- accumulated for grams already present. Every joe.batch samples the
-- record is flushed to the chunk files.
function joe.recordItem(thread_id, items)
for _, item in pairs(items) do
if joe.record[item[1]] then
joe.record[item[1]][2] = joe.record[item[1]][2] + item[3]
joe.record[item[1]][3] = joe.record[item[1]][3] + item[4]
else
joe.record[item[1]] = tds.Vec{item[2], item[3], item[4]}
end
end
-- joe.count tracks processed samples, not distinct grams.
joe.count = joe.count + 1
-- Check write
if math.fmod(joe.count, joe.batch) == 0 then
print('Writing records to files at '..joe.count)
joe.writeRecord()
end
end
-- Print information
-- Log a progress message forwarded from a worker thread.
function joe.print(thread_id, message)
   local line = 'rpc = print, thread = '..thread_id
   print(line..', message = '..message)
end
-- Notify exit
-- Count a worker's exit notification; main loop stops when all exit.
function joe.notifyExit(thread_id)
   joe.finished = joe.finished + 1
   local line = 'rpc = notifyExit, thread = '..thread_id
   print(line..', finished = '..joe.finished)
end
-- UTF-8 encoding function
-- Ref: http://stackoverflow.com/questions/7983574/how-to-write-a-unicode-symbol
-- -in-lua
-- Factory returning a UTF-8 encoder closure (kept as a factory so the
-- byte-marker table is captured as an upvalue inside worker threads).
function joe.utf8str()
   -- {max code point, leading-byte base} for 2-, 3- and 4-byte forms.
   local bytemarkers = {{0x7FF, 192}, {0xFFFF, 224}, {0x1FFFFF, 240}}
   return function (decimal)
      local string = require('string')
      -- ASCII maps to a single byte.
      if decimal < 128 then return string.char(decimal) end
      local out = {}
      for nbytes, marker in ipairs(bytemarkers) do
         if decimal <= marker[1] then
            -- Fill the continuation bytes from the low 6-bit groups.
            for slot = nbytes + 1, 2, -1 do
               local low = decimal % 64
               decimal = (decimal - low) / 64
               out[slot] = string.char(128 + low)
            end
            -- The leading byte carries the remaining high bits.
            out[1] = string.char(marker[2] + decimal)
            break
         end
      end
      return table.concat(out)
   end
end
-- Read the word frequency list from a CSV file.
-- Each line is expected as "word","escaped word","frequency".
-- Returns:
--   freq: torch.Tensor of frequencies in file order.
--   word_list: tds.Hash mapping 1-based rank -> unescaped word string.
function joe.readList(list)
   local freq = {}
   local word_list = tds.Hash()
   -- Fail with a clear message if the list file cannot be opened.
   local fd = assert(io.open(list), 'cannot open list file: '..list)
   for line in fd:lines() do
      local content = joe.parseCSVLine(line)
      -- (A dead re-assignment of content[2] was removed; only the
      -- first and third columns are used.)
      freq[#freq + 1] = tonumber(content[3])
      word_list[#freq] = content[1]:gsub('\\n', '\n')
   end
   -- Close the descriptor (it was previously leaked).
   fd:close()
   return torch.Tensor(freq), word_list
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Splits one CSV line into an array table of field strings.
-- line: the raw CSV line; sep: field separator (defaults to ','),
-- matched literally so pattern magic characters work as separators.
-- Raises a descriptive error on an unterminated quoted field instead
-- of crashing with "attempt to perform arithmetic on a nil value".
function joe.parseCSVLine(line,sep)
   local res = {}
   local pos = 1
   sep = sep or ','
   while true do
      local c = string.sub(line,pos,pos)
      if (c == "") then break end
      if (c == '"') then
         -- quoted value (ignore separator within)
         local txt = ""
         repeat
            local startp,endp = string.find(line,'^%b""',pos)
            -- Guard against a quoted field with no closing quote.
            assert(startp, 'unterminated quote in CSV line: '..line)
            txt = txt..string.sub(line,startp+1,endp-1)
            pos = endp + 1
            c = string.sub(line,pos,pos)
            if (c == '"') then txt = txt..'"' end
            -- check first char AFTER quoted string, if it is another
            -- quoted string without separator, then append it
            -- this is the way to "escape" the quote char in a quote.
         until (c ~= '"')
         table.insert(res,txt)
         assert(c == sep or c == "")
         pos = pos + 1
      else
         -- no quotes used, just look for the first separator
         -- (plain find: treat sep as literal text, not a Lua pattern)
         local startp,endp = string.find(line,sep,pos,true)
         if (startp) then
            table.insert(res,string.sub(line,pos,startp-1))
            pos = endp + 1
         else
            -- no separator found -> use rest of string and terminate
            table.insert(res,string.sub(line,pos))
            break
         end
      end
   end
   return res
end
-- Execute the program immediately, then expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/limit_code.lua
================================================
--[[
Limit the maximum code value
Copyright 2016 Xiang Zhang
Usage: th limit_code.lua [input] [output] [limit]
--]]
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: load the code data, cap its values at the limit, and
-- save the result.
function joe.main()
   local input = arg[1] or '../data/dianping/train_word.t7b'
   local output = arg[2] or '../data/dianping/train_word_limit.t7b'
   local limit = tonumber(arg[3]) or 200000
   print('Loading data from '..input)
   local dataset = torch.load(input)
   print('Limiting code to '..limit)
   local limited = joe.limitCode(dataset, limit)
   print('Saving to '..output)
   torch.save(output, limited)
end
-- Replace every code value greater than `limit` with limit + 1, in
-- place, returning the data in the same {code, code_value} shape.
function joe.limitCode(data, limit)
   local code, code_value = data.code, data.code_value
   -- 0/1 masks for in-range and out-of-range entries.
   local keep = code_value:le(limit):long()
   local overflow = code_value:gt(limit):long()
   -- Zero out the overflowing entries, then write limit + 1 into them.
   code_value:cmul(keep):add(overflow:mul(limit + 1))
   return {code = code, code_value = code_value}
end
-- Execute the program immediately, then expose the module table.
joe.main()
return joe
================================================
FILE: data/dianping/limit_csvlines.sh
================================================
#!/bin/bash
# Limit csv files to designated number of lines
# Copyright 2015 Xiang Zhang
#
# Usage: bash limit_csvlines.sh [input] [output] [limit]
set -x;
set -e;
# Quote the positional parameters so paths containing spaces or glob
# characters work; the default limit stays 1000001 lines.
head -n "${3:-1000001}" "$1" > "$2";
================================================
FILE: data/dianping/queue.lua
================================================
--[[
Multithreaded queue based on tds
Copyright 2015 Xiang Zhang
--]]
local class = require('pl.class')
local ffi = require('ffi')
local serialize = require('threads.sharedserialize')
local tds = require('tds')
local threads = require('threads')
local torch = require('torch')
-- Append an underscore to distinguish between metatable and class name
local Queue_ = torch.class('Queue')
-- Constructor
-- n: buffer size
function Queue_:__init(size)
-- Ring-buffer storage; a tds hash so it can be shared across threads.
self.data = tds.hash()
-- pointer[1]: next write slot, pointer[2]: next read slot,
-- pointer[3]: current number of buffered items.
self.pointer = torch.LongTensor(3):fill(1)
self.pointer[3] = 0
-- Maximum number of buffered items (defaults to 10).
self.size = size or 10
-- Mutex guards data/pointer; the conditions signal push/pop events.
self.mutex = threads.Mutex()
self.added_condition = threads.Condition()
self.removed_condition = threads.Condition()
end
-- Blocking push: serialize the item and append it to the ring buffer,
-- waiting while the buffer is full.
function Queue_:push(item)
-- Serialize outside the lock to keep the critical section short.
local storage = serialize.save(item)
self.mutex:lock()
-- Wait until a slot is free.
while self.pointer[3] == self.size do
self.removed_condition:wait(self.mutex)
end
self.data[self.pointer[1]] = storage:string()
-- Advance the write pointer circularly and bump the item count.
self.pointer[1] = math.fmod(self.pointer[1], self.size) + 1
self.pointer[3] = self.pointer[3] + 1
self.mutex:unlock()
-- Wake one consumer blocked in pop().
self.added_condition:signal()
end
-- Blocking pop: wait until an item is available, remove it from the
-- ring buffer and return the deserialized value.
function Queue_:pop()
self.mutex:lock()
-- Wait until at least one item is buffered.
while self.pointer[3] == 0 do
self.added_condition:wait(self.mutex)
end
local storage = torch.CharStorage():string(self.data[self.pointer[2]])
-- Advance the read pointer circularly and decrement the item count.
self.pointer[2] = math.fmod(self.pointer[2], self.size) + 1
self.pointer[3] = self.pointer[3] - 1
self.mutex:unlock()
-- Wake one producer blocked in push().
self.removed_condition:signal()
-- Deserialize outside the lock to keep the critical section short.
local item = serialize.load(storage)
return item
end
-- Non-blocking push: returns nil immediately if the buffer is full,
-- otherwise pushes and returns the item.
function Queue_:push_async(item)
-- Unsynchronized fast-path check; re-checked under the lock below.
if self.pointer[3] == self.size then
return
end
local storage = serialize.save(item)
self.mutex:lock()
-- Authoritative check now that the lock is held.
if self.pointer[3] == self.size then
self.mutex:unlock()
return
end
self.data[self.pointer[1]] = storage:string()
-- Advance the write pointer circularly and bump the item count.
self.pointer[1] = math.fmod(self.pointer[1], self.size) + 1
self.pointer[3] = self.pointer[3] + 1
self.mutex:unlock()
-- Wake one consumer blocked in pop().
self.added_condition:signal()
return item
end
-- Non-blocking pop: returns nil immediately if the buffer is empty,
-- otherwise removes and returns the deserialized item.
function Queue_:pop_async()
-- Unsynchronized fast-path check; re-checked under the lock below.
if self.pointer[3] == 0 then
return
end
self.mutex:lock()
-- Authoritative check now that the lock is held.
if self.pointer[3] == 0 then
self.mutex:unlock()
return
end
local storage = torch.CharStorage():string(self.data[self.pointer[2]])
-- Advance the read pointer circularly and decrement the item count.
self.pointer[2] = math.fmod(self.pointer[2], self.size) + 1
self.pointer[3] = self.pointer[3] - 1
self.mutex:unlock()
-- Wake one producer blocked in push().
self.removed_condition:signal()
local item = serialize.load(storage)
return item
end
-- Release the synchronization primitives owned by this queue.
function Queue_:free()
self.mutex:free()
self.added_condition:free()
self.removed_condition:free()
end
-- Custom serialization hook (threads.sharedserialize): instead of
-- deep-copying the queue, write raw pointers to the underlying tds
-- hash and pointer tensor so all threads share one buffer.
function Queue_:__write(f)
local data = self.data
f:writeLong(torch.pointer(data))
-- Retain so the hash survives while another thread references it.
tds.C.tds_hash_retain(data)
local pointer = self.pointer
f:writeLong(torch.pointer(pointer))
pointer:retain()
f:writeObject(self.size)
-- Synchronization primitives are shared by id.
f:writeObject(self.mutex:id())
f:writeObject(self.added_condition:id())
f:writeObject(self.removed_condition:id())
end
-- Reconstruct the queue in the receiving thread from what __write emitted:
-- adopt the shared hash and tensor, and rebind synchronization primitives
-- by id.
function Queue_:__read(f)
   local data = f:readLong()
   data = ffi.cast('tds_hash&', data)
   -- Hand the retained reference to this thread's GC for release.
   ffi.gc(data, tds.C.tds_hash_free)
   self.data = data
   local pointer = f:readLong()
   pointer = torch.pushudata(pointer, 'torch.LongTensor')
   self.pointer = pointer
   self.size = f:readObject()
   self.mutex = threads.Mutex(f:readObject())
   self.added_condition = threads.Condition(f:readObject())
   self.removed_condition = threads.Condition(f:readObject())
end
-- Expose the public constructor table (Queue), not the method metatable
-- (Queue_).
return Queue
================================================
FILE: data/dianping/remove_duplication.py
================================================
#!/usr/bin/python3
'''
Remove duplication from csv format file
Copyright 2015 Xiang Zhang
Usage: python3 remove_duplication.py -i [input] -o [output]
'''
# Python 3 compatibility
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
# Input file
INPUT = '../data/dianping/reviews_nonull.csv'
# Output file
OUTPUT = '../data/dianping/reviews_nodup.csv'
import argparse
import csv
# Main program
def main():
    '''Parse command-line options into the module globals and run the
    deduplication pass.'''
    global INPUT
    global OUTPUT
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help = 'Input file', default = INPUT)
    parser.add_argument('-o', '--output', help = 'Output file',
                        default = OUTPUT)
    arguments = parser.parse_args()
    INPUT = arguments.input
    OUTPUT = arguments.output
    removeDuplicate()
# Deduplicate the text using python set
def removeDuplicate():
    '''Copy rows from INPUT to OUTPUT, dropping rows whose text fields
    (all columns after the first) were seen on an earlier row. The first
    column (the class label) is ignored when testing for duplication.'''
    n = 0
    valid = 0
    seen = set()
    # Context managers guarantee the handles are flushed and closed even on
    # error; the original opened both files and never closed them.
    with open(INPUT, newline = '', encoding = 'utf-8') as ifd, \
            open(OUTPUT, 'w', newline = '', encoding = 'utf-8') as ofd:
        reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
        writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
        for row in reader:
            line = ' '.join(row[1:])
            n = n + 1
            if line not in seen:
                valid = valid + 1
                seen.add(line)
                writer.writerow(row)
            if n % 10000 == 0:
                print('\rProcessing line: {}, valid: {}'.format(n, valid), end = '')
    print('\rProcessed lines: {}, valid: {}'.format(n, valid))
# Script entry point
if __name__ == '__main__':
    main()
================================================
FILE: data/dianping/remove_null.sh
================================================
#!/bin/bash
# Remove NULL character from file
# Copyright 2015 Xiang Zhang
#
# Usage: bash remove_null.sh [input] [output]
set -x;
set -e;
# Quote the positional parameters so paths containing spaces or glob
# characters are passed through intact.
tr -d '\000' < "$1" > "$2";
================================================
FILE: data/dianping/segment_roman_word.lua
================================================
--[[
Create romanized word data from romanized data in csv
Copyright 2016 Xiang Zhang
Usage: th segment_roman_word.lua [input] [output] [list] [read]
--]]
local ffi = require('ffi')
local io = require('io')
local math = require('math')
local tds = require('tds')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: build (or reload) the word index, then encode the input.
function joe.main()
   local input = arg[1] or '../data/dianping/train_pinyin.csv'
   local output = arg[2] or '../data/dianping/train_pinyin_word.csv'
   local list = arg[3] or '../data/dianping/train_pinyin_word_list.csv'
   local read = (arg[4] == 'true')
   local word_index, word_total
   if not read then
      print('Counting words')
      local word_count, word_freq = joe.splitWords(input)
      print('Sorting words by count')
      word_index, word_total = joe.sortWords(list, word_count, word_freq)
   else
      print('Reading word index')
      word_index, word_total = joe.readWords(list)
   end
   print('Constructing word index output')
   joe.constructWords(input, output, word_index, word_total)
end
-- Load a previously written word list file.
-- Returns a (word -> 1-based rank) hash and the number of words read.
function joe.readWords(list)
   local word_index = tds.Hash()
   local fd = io.open(list)
   local count = 0
   for line in fd:lines() do
      count = count + 1
      if math.fmod(count, 10000) == 0 then
         io.write('\rProcessing line: '..count)
         io.flush()
      end
      local fields = joe.parseCSVLine(line)
      -- Undo the newline escaping applied when the list was written.
      local word = fields[1]:gsub('\\n', '\n')
      word_index[word] = count
   end
   print('\rProcessed lines: '..count)
   fd:close()
   return word_index, count
end
-- Scan the input csv and tally, for every whitespace-separated token, its
-- total occurrence count and its per-line document frequency.
-- Returns (word -> count, word -> frequency normalized by line count).
function joe.splitWords(input)
   local word_count, word_freq = tds.Hash(), tds.Hash()
   local fd = io.open(input)
   local n = 0
   for line in fd:lines() do
      n = n + 1
      if math.fmod(n, 10000) == 0 then
         io.write('\rProcessing line: ', n)
         io.flush()
      end
      local content = joe.parseCSVLine(line)
      -- Bug fix: field_set was assigned without `local`, polluting the
      -- global environment; it must be a fresh local table per line.
      local field_set = {}
      for i = 2, #content do
         -- Unescape newlines, trim, then pad punctuation with spaces so it
         -- tokenizes as separate words.
         content[i] = content[i]:gsub('\\n', '\n'):gsub("^%s*(.-)%s*$", "%1")
         content[i] = content[i]:gsub('(%p)', ' %1 ')
         for word in content[i]:gmatch('[%S]+') do
            word_count[word] = (word_count[word] or 0) + 1
            -- Count each word at most once per line for the frequency.
            if not field_set[word] then
               field_set[word] = true
               word_freq[word] = (word_freq[word] or 0) + 1
            end
         end
      end
   end
   print('\rProcessed lines: '..n)
   fd:close()
   -- Normalizing word frequencies
   for key, value in pairs(word_freq) do
      word_freq[key] = value / n
   end
   return word_count, word_freq
end
-- Write the vocabulary to `list` sorted by descending count, one csv row
-- of (word, count, frequency) each. Returns the (word -> rank) hash and
-- the vocabulary size.
function joe.sortWords(list, word_count, word_freq)
   -- Collect and sort the words by descending count.
   local word_list = tds.Vec()
   for word in pairs(word_count) do
      word_list:insert(word)
   end
   word_list:sort(function(a, b) return word_count[a] > word_count[b] end)
   -- Rank lookup table.
   local word_index = tds.Hash()
   for rank, word in ipairs(word_list) do
      word_index[word] = rank
   end
   -- Persist, escaping newlines and doubling quotes for csv.
   local fd = io.open(list, 'w')
   for _, word in ipairs(word_list) do
      local escaped = word:gsub('\n', '\\n'):gsub('"', '""')
      fd:write('"', escaped, '","', word_count[word], '","',
               word_freq[word], '"\n')
   end
   fd:close()
   return word_index, #word_list
end
-- Rewrite the input csv into `output`, replacing every whitespace-separated
-- token by its rank in word_index; unknown tokens become word_total + 1
-- (the out-of-vocabulary index).
function joe.constructWords(input, output, word_index, word_total)
   local ifd = io.open(input)
   local ofd = io.open(output, 'w')
   local n = 0
   for line in ifd:lines() do
      n = n + 1
      if math.fmod(n, 10000) == 0 then
         io.write('\rProcessing line: ', n)
         io.flush()
      end
      local content = joe.parseCSVLine(line)
      -- Column 1 (the class label) is copied through unchanged.
      ofd:write('"', content[1], '"')
      for i = 2, #content do
         -- Same normalization as splitWords: unescape newlines, trim, pad
         -- punctuation with spaces so it tokenizes separately.
         content[i] = content[i]:gsub('\\n', '\n'):gsub("^%s*(.-)%s*$", "%1")
         content[i] = content[i]:gsub('(%p)', ' %1 ')
         local first_write = true
         ofd:write(',"')
         for word in content[i]:gmatch('[%S]+') do
            local index = word_index[word] or word_total + 1
            -- Space-separate indices without a trailing space.
            if first_write then
               first_write = false
               ofd:write(index)
            else
               ofd:write(' ', index)
            end
         end
         ofd:write('"')
      end
      ofd:write('\n')
   end
   print('\rProcessed lines: '..n)
   ifd:close()
   ofd:close()
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Split one csv line into a table of field strings.
-- Quoted fields may contain separators and escaped quotes (""); a
-- malformed quoted field (unbalanced quotes) raises an error, as before.
-- Ref: http://lua-users.org/wiki/LuaCsv
function joe.parseCSVLine(line, sep)
   sep = sep or ','
   local fields = {}
   local cursor = 1
   while true do
      local ch = line:sub(cursor, cursor)
      if ch == '' then
         break
      end
      if ch == '"' then
         -- Quoted field: consume balanced "..." chunks; a quote found
         -- immediately after a chunk is an escaped quote and the field
         -- continues.
         local value = ''
         repeat
            local first, last = line:find('^%b""', cursor)
            value = value..line:sub(first + 1, last - 1)
            cursor = last + 1
            ch = line:sub(cursor, cursor)
            if ch == '"' then
               value = value..'"'
            end
         until ch ~= '"'
         fields[#fields + 1] = value
         -- A quoted field must be followed by a separator or end of line.
         assert(ch == sep or ch == '')
         cursor = cursor + 1
      else
         -- Unquoted field: runs to the next separator or end of line.
         local first, last = line:find(sep, cursor)
         if first then
            fields[#fields + 1] = line:sub(cursor, first - 1)
            cursor = last + 1
         else
            fields[#fields + 1] = line:sub(cursor)
            break
         end
      end
   end
   return fields
end
-- Run the script, then export the module table.
joe.main()
return joe
================================================
FILE: data/dianping/segment_word.py
================================================
#!/usr/bin/python3
'''
Convert Chinese datasets to Index of Words
Copyright 2016 Xiang Zhang
Usage: python3 segment_word.py -i [input] -l [list] -o [output] [-r]
'''
#Input file
INPUT = '../data/dianping/train.csv'
#Output file
OUTPUT = '../data/dianping/train_word.csv'
# List file
LIST = '../data/dianping/train_word_list.csv'
# Read already defined word list
READ = False
import argparse
import csv
import jieba
# Main program
def main():
    '''Parse command-line options into the module globals, then segment
    and index the input file.'''
    global INPUT
    global OUTPUT
    global LIST
    # Bug fix: READ was assigned without a global declaration, creating a
    # function-local that silently shadowed the module-level READ constant.
    global READ
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help = 'Input file', default = INPUT)
    parser.add_argument(
        '-o', '--output', help = 'Output file', default = OUTPUT)
    parser.add_argument('-l', '--list', help = 'Word list file', default = LIST)
    parser.add_argument(
        '-r', '--read', help = 'Read from list file', action = 'store_true')
    args = parser.parse_args()
    INPUT = args.input
    OUTPUT = args.output
    LIST = args.list
    READ = args.read
    if READ:
        print('Reading word index')
        word_index = readWords()
    else:
        print('Counting words')
        word_count, word_freq = segmentWords()
        print('Sorting words by count')
        word_index = sortWords(word_count, word_freq)
    print('Constructing word index output')
    convertWords(word_index)
# Read from pre-existing word list
# Read from pre-existing word list
def readWords():
    '''Load the existing word list file (LIST) and return a dict mapping
    each word to its 1-based rank in the file.'''
    word_index = dict()
    n = 0
    # The with-statement guarantees the handle is closed; the original
    # opened the file and never closed it.
    with open(LIST, encoding = 'utf-8', newline = '') as ifd:
        reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
        for row in reader:
            # Undo the newline escaping applied when the list was written.
            word = row[0].replace('\\n', '\n')
            word_index[word] = n + 1
            n = n + 1
            if n % 1000 == 0:
                print('\rProcessing line: {}'.format(n), end = '')
    print('\rProcessed lines: {}'.format(n))
    return word_index
# Segment the text in Chinese
# Segment the text in Chinese
def segmentWords():
    '''Segment every text field of INPUT with jieba; return two dicts:
    word -> total occurrence count, and word -> fraction of rows that
    contain the word.'''
    word_count = dict()
    word_freq = dict()
    n = 0
    # The with-statement guarantees the handle is closed (the original
    # never closed it).
    with open(INPUT, encoding = 'utf-8', newline = '') as ifd:
        reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
        for row in reader:
            # Track words seen in this row so the document frequency counts
            # each word at most once per row.
            field_set = set()
            # Column 0 is the class label; only columns 1+ carry text.
            for i in range(1, len(row)):
                field = row[i].replace('\\n', '\n')
                for word in jieba.cut(field):
                    word_count[word] = word_count.get(word, 0) + 1
                    if word not in field_set:
                        field_set.add(word)
                        word_freq[word] = word_freq.get(word, 0) + 1
            n = n + 1
            if n % 1000 == 0:
                print('\rProcessing line: {}'.format(n), end = '')
    print('\rProcessed lines: {}'.format(n))
    # Normalize document counts to frequencies; guard against an empty
    # input file, which previously raised ZeroDivisionError.
    if n > 0:
        for word in word_freq:
            word_freq[word] = float(word_freq[word]) / float(n)
    return word_count, word_freq
# Sort words for a given count dictionary object
# Sort words for a given count dictionary object
def sortWords(word_count, word_freq):
    '''Write words to LIST ordered by descending count, one csv row of
    (word, count, frequency) each; return a dict mapping each word to its
    1-based rank.'''
    ranked = sorted(
        word_count, key = lambda word: word_count[word], reverse = True)
    ofd = open(LIST, 'w', encoding = 'utf-8', newline = '')
    writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
    word_index = dict()
    n = 0
    for position, word in enumerate(ranked, start = 1):
        # Escape raw newlines so every word stays on one csv row.
        writer.writerow([word.replace('\n', '\\n'), str(word_count[word]),
                         str(word_freq[word])])
        word_index[word] = position
        n = n + 1
        if n % 1000 == 0:
            print('\rProcessing word: {}'.format(n), end = '')
    print('\rProcessed words: {}'.format(n))
    ofd.close()
    return word_index
# Convert the text in Chinese to word list
# Convert the text in Chinese to word list
def convertWords(word_index):
    '''Rewrite INPUT into OUTPUT, replacing each text field by the
    space-joined ranks of its jieba-segmented words; unknown words map to
    len(word_index) + 1 (the out-of-vocabulary index).'''
    # Open the files
    ifd = open(INPUT, encoding = 'utf-8', newline = '')
    ofd = open(OUTPUT, 'w', encoding = 'utf-8', newline = '')
    reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
    writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
    # Loop over the csv rows
    n = 0
    for row in reader:
        new_row = list()
        # Column 0 (the class label) is copied through unchanged.
        new_row.append(row[0])
        for i in range(1, len(row)):
            field = row[i].replace('\\n', '\n')
            field_list = jieba.cut(field)
            new_row.append(' '.join(map(
                str, map(lambda word: word_index.get(word, len(word_index) + 1),
                         field_list))))
        writer.writerow(new_row)
        n = n + 1
        if n % 1000 == 0:
            print('\rProcessing line: {}'.format(n), end = '')
    print('\rProcessed lines: {}'.format(n))
    ifd.close()
    ofd.close()
# Script entry point
if __name__ == '__main__':
    main()
================================================
FILE: data/dianping/select_data.lua
================================================
--[[
Select data from non-duplicate datasets
Copyright 2015 Xiang Zhang
Usage: th select_data.lua [count] [input] [output]
--]]
local io = require('io')
local math = require('math')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: read the per-class count spec, build random keep/drop masks,
-- then stream the input csv and emit kept rows with remapped class labels.
function joe.main()
   local count = arg[1] or '../data/dianping/reviews_count.csv'
   local input = arg[2] or '../data/dianping/reviews_nodup.csv'
   local output = arg[3] or '../data/dianping/data.csv'
   -- map[class] = remapped target label; index[class] = 0/1 keep mask with
   -- one entry per occurrence of that class, in input order.
   local map = {}
   local index = {}
   local cfd = io.open(count)
   for line in cfd:lines() do
      -- Count spec columns: class, target label, total rows, rows to keep.
      local content = joe.parseCSVLine(line)
      local class = tonumber(content[1])
      local target = tonumber(content[2])
      local total = tonumber(content[3])
      local choose = tonumber(content[4])
      print('Constructing index '..class..'>'..target..': '..choose..'/'..total)
      map[class] = target
      index[class] = torch.ByteTensor(total):fill(1)
      -- Randomly clear (total - choose) slots so exactly `choose` remain.
      local perm = torch.randperm(total)
      for i = 1, total - choose do
         index[class][perm[i]] = 0
      end
   end
   cfd:close()
   local n = 0
   local progress = {}
   local ifd = io.open(input)
   local ofd = io.open(output, 'w')
   for line in ifd:lines() do
      n = n + 1
      if math.fmod(n, 100000) == 0 then
         io.write('\rProcessing line: ', n)
         io.flush()
      end
      local content = joe.parseCSVLine(line)
      local class = tonumber(content[1])
      local target = map[class]
      -- progress[class] counts how many rows of this class we have seen.
      progress[class] = progress[class] and progress[class] + 1 or 1
      if index[class] and index[class][progress[class]] == 1 then
         -- Rewrite the label, then copy the rest of the raw line verbatim:
         -- positions 1 .. len+2 hold `"label"`, so len+3 is the comma.
         ofd:write(
            '"', target, '"', (line:sub(content[1]:len() + 3) or ''), '\n')
      end
   end
   print('\rProcessed lines: '..n)
   ifd:close()
   ofd:close()
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Split one csv line into a table of field strings.
-- Quoted fields may contain separators and escaped quotes (""); a
-- malformed quoted field (unbalanced quotes) raises an error, as before.
-- Ref: http://lua-users.org/wiki/LuaCsv
function joe.parseCSVLine(line, sep)
   sep = sep or ','
   local fields = {}
   local cursor = 1
   while true do
      local ch = line:sub(cursor, cursor)
      if ch == '' then
         break
      end
      if ch == '"' then
         -- Quoted field: consume balanced "..." chunks; a quote found
         -- immediately after a chunk is an escaped quote and the field
         -- continues.
         local value = ''
         repeat
            local first, last = line:find('^%b""', cursor)
            value = value..line:sub(first + 1, last - 1)
            cursor = last + 1
            ch = line:sub(cursor, cursor)
            if ch == '"' then
               value = value..'"'
            end
         until ch ~= '"'
         fields[#fields + 1] = value
         -- A quoted field must be followed by a separator or end of line.
         assert(ch == sep or ch == '')
         cursor = cursor + 1
      else
         -- Unquoted field: runs to the next separator or end of line.
         local first, last = line:find(sep, cursor)
         if first then
            fields[#fields + 1] = line:sub(cursor, first - 1)
            cursor = last + 1
         else
            fields[#fields + 1] = line:sub(cursor)
            break
         end
      end
   end
   return fields
end
-- Run the script, then export the module table.
joe.main()
return joe
================================================
FILE: data/dianping/shuffle_lines.sh
================================================
#!/bin/bash
# Shuffle lines in a text file
# Copyright 2017 Xiang Zhang
#
# Usage: bash shuffle_lines.sh [input] [output]
set -x;
set -e;
# Quote the positional parameters so paths containing spaces or glob
# characters are passed through intact.
shuf "$1" > "$2";
================================================
FILE: data/dianping/sort_gram_count.sh
================================================
#!/bin/bash
# Sort distributed grams file
# Copyright 2016 Xiang Zhang
#
# Usage: bash sort_gram_count.sh [input_directory] [output_directory] [temporary] [memory]
set -x;
set -e;
# Sort each shard by its first comma-separated field. Quote every expansion
# so paths with spaces survive, and prefer $(...) over backticks.
for file in "$1"/*.csv; do
    sort -S "${4:-50%}" -t ',' -k1,1 -T "${3:-/scratch}" "$file" > "$2/$(basename "$file")"
done;
================================================
FILE: data/dianping/sort_gram_list.sh
================================================
#!/bin/bash
# Sort list of grams and cut the count
# Copyright 2016 Xiang Zhang
#
# Usage: bash sort_gram_list.sh [input] [output] [temporary] [memory]
set -x;
set -e;
# Quote expansions so paths with spaces or glob characters work.
sort -S "${4:-50%}" -t ',' -k1,1nr -T "${3:-/scratch}" "$1" | cut -f 2- -d ',' > "$2";
================================================
FILE: data/dianping/split_lines.sh
================================================
#!/bin/bash
# Split lines in a text file
# Copyright 2017 Xiang Zhang
#
# Usage: bash split_lines.sh [lines] [input] [output_prefix]
#
# Note: .txt postfix will be automatically added.
set -x;
set -e;
# Quote the positional parameters so paths containing spaces or glob
# characters are passed through intact.
split -d -a 1 --additional-suffix=.txt -l "$1" "$2" "$3";
================================================
FILE: data/dianping/split_train.lua
================================================
--[[
Split data into training and testing subsets
Copyright 2015 Xiang Zhang
Usage: th split_train.lua [count] [input] [train] [test]
--]]
local io = require('io')
local math = require('math')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: read per-class counts, randomly mark test_count occurrences
-- of each class for the test set, then split the input into train/test.
function joe.main()
   local count = arg[1] or '../data/dianping/data_count.csv'
   local input = arg[2] or '../data/dianping/data.csv'
   local train = arg[3] or '../data/dianping/train.csv'
   local test = arg[4] or '../data/dianping/test.csv'
   -- index[class] is a 0/1 mask over occurrences: 1 = goes to test set.
   local index = {}
   local cfd = io.open(count)
   for line in cfd:lines() do
      -- Count spec columns: class, total rows, train rows, test rows.
      local content = joe.parseCSVLine(line)
      local class = tonumber(content[1])
      local total = tonumber(content[2])
      local train_count = tonumber(content[3])
      local test_count = tonumber(content[4])
      print('Constructing index '..class..': '..train_count..
               ','..test_count..','..total)
      index[class] = torch.ByteTensor(total):zero()
      -- Randomly set test_count slots to 1.
      local perm = torch.randperm(total)
      for i = 1, test_count do
         index[class][perm[i]] = 1
      end
   end
   cfd:close()
   local n = 0
   local progress = {}
   local ifd = io.open(input)
   local trfd = io.open(train, 'w')
   local tefd = io.open(test, 'w')
   for line in ifd:lines() do
      n = n + 1
      if math.fmod(n, 100000) == 0 then
         io.write('\rProcessing line: ', n)
         io.flush()
      end
      local content = joe.parseCSVLine(line)
      local class = tonumber(content[1])
      -- progress[class] counts how many rows of this class we have seen.
      progress[class] = progress[class] and progress[class] + 1 or 1
      -- Classes without a mask fall through to the test file.
      if index[class] and index[class][progress[class]] == 0 then
         trfd:write(line, '\n')
      else
         tefd:write(line, '\n')
      end
   end
   print('\rProcessed lines: '..n)
   ifd:close()
   trfd:close()
   tefd:close()
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Split one csv line into a table of field strings.
-- Quoted fields may contain separators and escaped quotes (""); a
-- malformed quoted field (unbalanced quotes) raises an error, as before.
-- Ref: http://lua-users.org/wiki/LuaCsv
function joe.parseCSVLine(line, sep)
   sep = sep or ','
   local fields = {}
   local cursor = 1
   while true do
      local ch = line:sub(cursor, cursor)
      if ch == '' then
         break
      end
      if ch == '"' then
         -- Quoted field: consume balanced "..." chunks; a quote found
         -- immediately after a chunk is an escaped quote and the field
         -- continues.
         local value = ''
         repeat
            local first, last = line:find('^%b""', cursor)
            value = value..line:sub(first + 1, last - 1)
            cursor = last + 1
            ch = line:sub(cursor, cursor)
            if ch == '"' then
               value = value..'"'
            end
         until ch ~= '"'
         fields[#fields + 1] = value
         -- A quoted field must be followed by a separator or end of line.
         assert(ch == sep or ch == '')
         cursor = cursor + 1
      else
         -- Unquoted field: runs to the next separator or end of line.
         local first, last = line:find(sep, cursor)
         if first then
            fields[#fields + 1] = line:sub(cursor, first - 1)
            cursor = last + 1
         else
            fields[#fields + 1] = line:sub(cursor)
            break
         end
      end
   end
   return fields
end
-- Run the script, then export the module table.
joe.main()
return joe
================================================
FILE: data/ifeng/construct_topic.py
================================================
#!/usr/bin/python3
'''
Create data from list of LZMA compressed archives of news articles
Copyright 2016 Xiang Zhang
Usage: python3 construct_topic.py -i [input directory] -o [output file]
'''
import argparse
import csv
import glob
import json
import lzma
INPUT = '../data/ifeng/article'
OUTPUT = '../data/ifeng/topic/news.csv'
# Classes
# 1: Mainland China Politics
# 2: International
# 3: Taiwan, Hong Kong and Macau Politics
# 4: Military
# 5: Society
CLASSES = {'11528': 1, '11574': 2, '11490': 3, '7609': 3, '4550': 4, '7837': 5}
def main():
    '''Parse command-line options into the module globals and build the
    topic dataset.'''
    global INPUT
    global OUTPUT
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help = 'Input file pattern',
                        default = INPUT)
    parser.add_argument('-o', '--output', help = 'Output file',
                        default = OUTPUT)
    options = parser.parse_args()
    INPUT = options.input
    OUTPUT = options.output
    createData()
def createData():
    '''Walk the per-class .json.xz article archives under INPUT and write
    one csv row per article to OUTPUT: (class index, title, abstract),
    where the abstract is the first content paragraph.'''
    ofd = open(OUTPUT, 'w', newline = '', encoding = 'utf-8')
    writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
    # Grab the files for each class prefix
    for prefix in CLASSES:
        files = glob.glob(INPUT + '/' + prefix + '_*.json.xz')
        index = CLASSES[prefix]
        n = 0
        filecount = 0
        for filename in files:
            filecount = filecount + 1
            print('Processing file {}/{}: {}. Processed items {}.'.format(
                filecount, len(files), filename, n))
            try:
                # The with-statement closes the archive even when a json or
                # decompression error aborts the loop; the original leaked
                # the handle in that case.
                with lzma.open(filename, 'rt', encoding = 'utf-8') as ifd:
                    for line in ifd:
                        news = json.loads(line)
                        title = news.get('title', '')
                        content = news.get('content', list())
                        abstract = ''
                        if len(content) > 0:
                            abstract = content[0]
                        n = n + 1
                        writer.writerow([index, title.replace('\n', '\\n'),
                                         abstract.replace('\n', '\\n')])
            except Exception as e:
                # Deliberate best-effort: report and skip corrupt archives.
                print('Exception (ignored): {}'.format(e))
    ofd.close()
# Script entry point
if __name__ == '__main__':
    main()
================================================
FILE: data/jd/count_data.lua
================================================
--[[
Count data for each class and length
Copyright 2016 Xiang Zhang
Usage: th count_data.lua [input] [output]
--]]
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point: build the (class, length) histogram, save it, then plot it.
function joe.main()
   local input = arg[1] or '../data/jd/sentiment/comment_sorted_nonull.csv'
   local output = arg[2] or '../data/jd/sentiment/comment_sorted_count.t7b'
   print('Counting data')
   local count = joe.count(input)
   -- NOTE(review): this overwrites the joe.count *function* with the
   -- result tensor, so main() cannot run twice. Presumably intentional
   -- (exposes the result on the returned module) -- confirm before changing.
   joe.count = count
   print('Saving to '..output)
   torch.save(output, count)
   print('Plotting result')
   joe.plot(count)
end
-- Build a (max_class x max_length) tensor where entry [c][l] is the number
-- of rows of class c whose combined trimmed text length is l bytes.
function joe.count(input)
   local count = {}
   local max_class = 0
   local max_length = 0
   local fd = io.open(input)
   local n = 0
   for line in fd:lines() do
      n = n + 1
      if math.fmod(n, 100000) == 0 then
         io.write('\rProcessing line: ', n)
         io.flush()
      end
      local content = joe.parseCSVLine(line)
      local class = tonumber(content[1])
      local length = 0
      -- Sum the trimmed byte length of every text field (column 2 onward).
      for i = 2, #content do
         length = length + content[i]:gsub("^%s*(.-)%s*$", "%1"):len()
      end
      count[class] = count[class] or {}
      count[class][length] = (count[class][length] or 0) + 1
      if class > max_class then
         max_class = class
      end
      if length > max_length then
         max_length = length
      end
   end
   print('\rProcessed lines: '..n)
   print('total classes = '..max_class..', maximum length = '..max_length)
   fd:close()
   -- Densify into a tensor; class 0 and zero-length rows are dropped
   -- because tensors are 1-indexed.
   local result = torch.Tensor(max_class, max_length):zero()
   for class, class_count in pairs(count) do
      if class > 0 then
         for length, length_count in pairs(class_count) do
            if length > 0 then
               result[class][length] = length_count
            end
         end
      end
   end
   return result
end
-- Plot the cumulative per-class length distribution using gnuplot.
function joe.plot(count)
   require('gnuplot')
   -- Cumulative sum along the length dimension.
   local cumulated = count:cumsum(2)
   local plots = {}
   for class = 1, cumulated:size(1) do
      plots[#plots + 1] = {tostring(class), cumulated[class], '-'}
   end
   local figure = gnuplot.figure()
   gnuplot.plot(unpack(plots))
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Split one csv line into a table of field strings.
-- Quoted fields may contain separators and escaped quotes (""); a
-- malformed quoted field (unbalanced quotes) raises an error, as before.
-- Ref: http://lua-users.org/wiki/LuaCsv
function joe.parseCSVLine(line, sep)
   sep = sep or ','
   local fields = {}
   local cursor = 1
   while true do
      local ch = line:sub(cursor, cursor)
      if ch == '' then
         break
      end
      if ch == '"' then
         -- Quoted field: consume balanced "..." chunks; a quote found
         -- immediately after a chunk is an escaped quote and the field
         -- continues.
         local value = ''
         repeat
            local first, last = line:find('^%b""', cursor)
            value = value..line:sub(first + 1, last - 1)
            cursor = last + 1
            ch = line:sub(cursor, cursor)
            if ch == '"' then
               value = value..'"'
            end
         until ch ~= '"'
         fields[#fields + 1] = value
         -- A quoted field must be followed by a separator or end of line.
         assert(ch == sep or ch == '')
         cursor = cursor + 1
      else
         -- Unquoted field: runs to the next separator or end of line.
         local first, last = line:find(sep, cursor)
         if first then
            fields[#fields + 1] = line:sub(cursor, first - 1)
            cursor = last + 1
         else
            fields[#fields + 1] = line:sub(cursor)
            break
         end
      end
   end
   return fields
end
-- Run the script, then export the module table.
joe.main()
return joe
================================================
FILE: data/jd/create_comment.py
================================================
#!/usr/bin/python3
'''
Create data from list of LZMA compressed archives of comments
Copyright 2016 Xiang Zhang
Usage: python3 create_comment.py -i [input file pattern] -o [output file]
'''
import argparse
import csv
import glob
import json
import lzma
INPUT = '../data/jd/comment/*.json.xz'
OUTPUT = '../data/jd/sentiment/comment.csv'
def main():
    '''Parse command-line options into the module globals and build the
    comment dataset.'''
    global INPUT
    global OUTPUT
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help = 'Input file pattern',
                        default = INPUT)
    parser.add_argument('-o', '--output', help = 'Output file',
                        default = OUTPUT)
    options = parser.parse_args()
    INPUT = options.input
    OUTPUT = options.output
    createData()
def createData():
    '''Read every review archive matching the INPUT glob and append one csv
    row per scored review to OUTPUT: (score, title, content). Reviews
    without a score are skipped.'''
    ofd = open(OUTPUT, 'w', newline = '', encoding = 'utf-8')
    writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
    # Grab the files
    files = glob.glob(INPUT)
    n = 0
    filecount = 0
    for filename in files:
        filecount = filecount + 1
        print('Processing file {}/{}: {}. Processed items {}.'.format(
            filecount, len(files), filename, n))
        try:
            # The with-statement closes the archive even when a json or
            # decompression error aborts the loop; the original leaked the
            # handle in that case.
            with lzma.open(filename, 'rt', encoding = 'utf-8') as ifd:
                for line in ifd:
                    review = json.loads(line)
                    # -1 marks a missing score; such reviews are dropped.
                    score = int(review['content'].get('score', -1))
                    title = review['content'].get('title', '')
                    content = review['content'].get('content', '')
                    if score != -1:
                        n = n + 1
                        writer.writerow([score, title.replace('\n', '\\n'),
                                         content.replace('\n', '\\n')])
        except Exception as e:
            # Deliberate best-effort: report and skip corrupt archives.
            print('Exception (ignored): {}'.format(e))
    ofd.close()
# Script entry point
if __name__ == '__main__':
    main()
================================================
FILE: data/jd/limit_length.lua
================================================
--[[
Limit length for data
Copyright 2016 Xiang Zhang
Usage: th limit_length.lua [input] [output] [min] [max]
--]]
-- A Logic Named Joe
local joe = {}
-- Entry point: parse arguments and filter rows by total text length.
function joe.main()
   local input = arg[1] or '../data/jd/sentiment/comment_sorted_nonull.csv'
   local output = arg[2] or '../data/jd/sentiment/comment_sorted_limited.csv'
   local min_length = tonumber(arg[3] or 0)
   local max_length = tonumber(arg[4] or math.huge)
   print('Limiting data')
   joe.limit(input, output, min_length, max_length)
end
-- Copy the rows of `input` whose combined trimmed field length lies in
-- [min, max] to `output`.
function joe.limit(input, output, min, max)
   local ifd = io.open(input)
   local ofd = io.open(output, 'w')
   local processed, saved = 0, 0
   for line in ifd:lines() do
      processed = processed + 1
      local fields = joe.parseCSVLine(line)
      -- Sum the trimmed length of every text field (column 2 onward).
      local length = 0
      for i = 2, #fields do
         length = length + fields[i]:gsub("^%s*(.-)%s*$", "%1"):len()
      end
      if length >= min and length <= max then
         saved = saved + 1
         ofd:write(line, '\n')
      end
      if math.fmod(processed, 100000) == 0 then
         io.write('\rProcessing line: ', processed, ', Saved lines: ', saved)
         io.flush()
      end
   end
   print('\rProcessed lines: '..processed..', Saved lines: '..saved)
   ifd:close()
   ofd:close()
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Split one csv line into a table of field strings.
-- Quoted fields may contain separators and escaped quotes (""); a
-- malformed quoted field (unbalanced quotes) raises an error, as before.
-- Ref: http://lua-users.org/wiki/LuaCsv
function joe.parseCSVLine(line, sep)
   sep = sep or ','
   local fields = {}
   local cursor = 1
   while true do
      local ch = line:sub(cursor, cursor)
      if ch == '' then
         break
      end
      if ch == '"' then
         -- Quoted field: consume balanced "..." chunks; a quote found
         -- immediately after a chunk is an escaped quote and the field
         -- continues.
         local value = ''
         repeat
            local first, last = line:find('^%b""', cursor)
            value = value..line:sub(first + 1, last - 1)
            cursor = last + 1
            ch = line:sub(cursor, cursor)
            if ch == '"' then
               value = value..'"'
            end
         until ch ~= '"'
         fields[#fields + 1] = value
         -- A quoted field must be followed by a separator or end of line.
         assert(ch == sep or ch == '')
         cursor = cursor + 1
      else
         -- Unquoted field: runs to the next separator or end of line.
         local first, last = line:find(sep, cursor)
         if first then
            fields[#fields + 1] = line:sub(cursor, first - 1)
            cursor = last + 1
         else
            fields[#fields + 1] = line:sub(cursor)
            break
         end
      end
   end
   return fields
end
-- Run the script, then export the module table.
joe.main()
return joe
================================================
FILE: data/jd/sort_data.sh
================================================
#!/bin/bash
# Sort comma-separated file starting from the second field
# Copyright 2016 Xiang Zhang
#
# Usage: bash sort_data.sh [input_file] [output_file] [temporary] [memory]
set -x;
set -e;
# Quote expansions so paths with spaces or glob characters work.
sort -S "${4:-50%}" -t ',' -k2 -u -T "${3:-/scratch}" "$1" > "$2";
================================================
FILE: data/joint/combine_word.lua
================================================
--[[
Combine multiple word datasets into one output file
Copyright 2016 Xiang Zhang
Usage: th combine_word.lua [input_1] [list_1] [input_2] [list_2] ...
   [output] [list]
--]]
local io = require('io')
local math = require('math')
local tds = require('tds')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point. Arguments come in (input_i, list_i) pairs followed by the
-- output data file and the merged output list:
--   th combine_word.lua in_1 list_1 ... in_k list_k output output_list
function joe.main()
   local input = {}
   local input_list = {}
   -- The final argument pair is output/output_list, hence the "- 1".
   for i = 1, math.floor(#arg / 2) - 1 do
      input[i] = arg[2 * i - 1]
      input_list[i] = arg[2 * i]
   end
   local output = arg[math.floor(#arg / 2) * 2 - 1] or
      '../data/joint/binary_train_word.csv'
   local output_list = arg[math.floor(#arg / 2) * 2] or
      '../data/joint/binary_train_word_list.csv'
   print('Loading output list from '..output_list)
   -- Only `list` and `dict` are used below; count/freq are not needed here.
   local list, count, freq, dict = joe.readList(output_list)
   print('Opening output file '..output)
   local ofd = io.open(output, 'w')
   for i = 1, #input do
      print('Loading input list from '..input_list[i])
      local local_list, local_count, local_freq, local_dict =
         joe.readList(input_list[i])
      print('Building input to output map')
      -- map[input vocabulary index] = index in the merged output vocabulary.
      local map = joe.buildMap(local_list, dict)
      print('Processing data from '..input[i])
      joe.processInput(input[i], map, ofd, list)
   end
   print('Closing output file '..output)
   ofd:close()
end
-- Read a word list csv whose columns are (word, count, frequency).
-- Returns (word vector, count vector, freq vector, word -> rank hash).
function joe.readList(file)
   local list, count, freq = tds.Vec(), tds.Vec(), tds.Vec()
   local dict = tds.Hash()
   local fd = io.open(file)
   for line in fd:lines() do
      local fields = joe.parseCSVLine(line)
      -- Undo the newline escaping applied when the list was written.
      local word = fields[1]:gsub('\\n', '\n')
      list:insert(word)
      count:insert(tonumber(fields[2]))
      freq:insert(tonumber(fields[3]))
      dict[word] = #list
   end
   fd:close()
   return list, count, freq, dict
end
-- Map every index of `input_list` to its index in the output vocabulary
-- `dict`; entries missing from dict stay unset (nil).
function joe.buildMap(input_list, dict)
   local map = tds.Vec()
   for index = 1, #input_list do
      map[index] = dict[input_list[index]]
   end
   return map
end
-- Re-encode one input data file through `map` and append it to ofd.
-- Unknown indices map to #list + 1 (the out-of-vocabulary index).
function joe.processInput(input, map, ofd, list)
   local ifd = io.open(input)
   local n = 0
   for line in ifd:lines() do
      n = n + 1
      if math.fmod(n, 10000) == 0 then
         io.write('\rProcessing line: ', n)
         io.flush()
      end
      -- Write class
      local content = joe.parseCSVLine(line)
      ofd:write('"', content[1], '"')
      -- Write title and comment
      for i = 2, #content do
         ofd:write(',"')
         for word in content[i]:gmatch('%d+') do
            -- NOTE(review): each index is followed by a space, so fields end
            -- with a trailing space -- confirm downstream readers accept it.
            ofd:write(map[tonumber(word)] or #list + 1, ' ')
         end
         ofd:write('"')
      end
      -- Write end of line
      ofd:write('\n')
   end
   print('\rProcessed lines: '..n)
   ifd:close()
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Split one csv line into a table of field strings.
-- Quoted fields may contain separators and escaped quotes (""); a
-- malformed quoted field (unbalanced quotes) raises an error, as before.
-- Ref: http://lua-users.org/wiki/LuaCsv
function joe.parseCSVLine(line, sep)
   sep = sep or ','
   local fields = {}
   local cursor = 1
   while true do
      local ch = line:sub(cursor, cursor)
      if ch == '' then
         break
      end
      if ch == '"' then
         -- Quoted field: consume balanced "..." chunks; a quote found
         -- immediately after a chunk is an escaped quote and the field
         -- continues.
         local value = ''
         repeat
            local first, last = line:find('^%b""', cursor)
            value = value..line:sub(first + 1, last - 1)
            cursor = last + 1
            ch = line:sub(cursor, cursor)
            if ch == '"' then
               value = value..'"'
            end
         until ch ~= '"'
         fields[#fields + 1] = value
         -- A quoted field must be followed by a separator or end of line.
         assert(ch == sep or ch == '')
         cursor = cursor + 1
      else
         -- Unquoted field: runs to the next separator or end of line.
         local first, last = line:find(sep, cursor)
         if first then
            fields[#fields + 1] = line:sub(cursor, first - 1)
            cursor = last + 1
         else
            fields[#fields + 1] = line:sub(cursor)
            break
         end
      end
   end
   return fields
end
-- Run the script, then export the module table.
joe.main()
return joe
================================================
FILE: data/joint/combine_word_list.lua
================================================
--[[
Combine multiple word lists into one merged list
Copyright 2016 Xiang Zhang
Usage: th combine_word_list.lua [list_1] [size_1] [list_2] [size_2] ... [output]
--]]
local io = require('io')
local math = require('math')
local tds = require('tds')
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Entry point. Arguments come in (list_i, size_i) pairs followed by the
-- output list file:
--   th combine_word_list.lua list_1 size_1 ... list_k size_k output
-- size_i weights list_i's frequencies when merging.
function joe.main()
   local input_list = {}
   local input_size = {}
   for i = 1, math.floor(#arg / 2) do
      input_list[i] = arg[2 * i - 1]
      input_size[i] = arg[2 * i]
   end
   local output_list = arg[math.floor(#arg / 2) * 2 + 1] or
      '../data/joint/binary_train_word_list.csv'
   -- word[i] bundles the four structures read from each input list.
   local word = {}
   for i = 1, #input_list do
      print('Loading list from '..input_list[i])
      local list, count, freq, dict = joe.readInputList(input_list[i])
      word[i] = {list = list, count = count, freq = freq, dict = dict}
   end
   print('Merging word lists')
   local list, count_table, freq_table, dict =
      joe.mergeWords(word, input_size)
   print('Writing merged word list to '..output_list)
   joe.writeOutputList(output_list, list, count_table, freq_table, dict)
end
-- Read a word list csv file into parallel structures.
-- Each row is "word","count","frequency"; literal '\n' escapes in the
-- word are unescaped. Returns a vector of words, a vector of counts, a
-- vector of frequencies, and a hash mapping word -> 1-based position.
function joe.readInputList(file)
   local words, counts, freqs = tds.Vec(), tds.Vec(), tds.Vec()
   local index = tds.Hash()
   local fd = io.open(file)
   for line in fd:lines() do
      local fields = joe.parseCSVLine(line)
      local word = fields[1]:gsub('\\n', '\n')
      words:insert(word)
      counts:insert(tonumber(fields[2]))
      freqs:insert(tonumber(fields[3]))
      index[word] = #words
   end
   fd:close()
   return words, counts, freqs, index
end
-- Write the merged word list to a csv file. Each row holds the word
-- (newline-escaped, inner quotes doubled), its total count, and its
-- weighted frequency. The dict argument is kept for signature
-- compatibility but is not used.
function joe.writeOutputList(file, list, count_table, freq_table, dict)
   local fd = io.open(file, 'w')
   for i = 1, #list do
      local word = list[i]
      local escaped = word:gsub('\n', '\\n'):gsub('"', '""')
      fd:write('"', escaped, '","', count_table[word], '","',
               freq_table[word], '"\n')
   end
   fd:close()
end
-- Merge several word lists into one.
-- word: array of {list, count, freq, dict} entries from readInputList.
-- size: array of dataset sizes used to weight each list's frequencies.
-- Returns the merged word vector (sorted by descending total count),
-- a count hash, a weighted-frequency hash, and a word -> rank hash.
function joe.mergeWords(word, size)
   local total_size = 0
   for _, s in ipairs(size) do
      total_size = total_size + s
   end
   local list = tds.Vec()
   local count_table = tds.Hash()
   local freq_table = tds.Hash()
   for i = 1, #word do
      local w = word[i]
      for j = 1, #w.list do
         local v = w.list[j]
         if count_table[v] then
            -- Seen before: accumulate count and weighted frequency.
            count_table[v] = count_table[v] + w.count[j]
            freq_table[v] = freq_table[v] + w.freq[j] * size[i] / total_size
         else
            -- First occurrence: record it.
            list:insert(v)
            count_table[v] = w.count[j]
            freq_table[v] = w.freq[j] * size[i] / total_size
         end
         if math.fmod(j, 100000) == 0 then
            io.write('\rProcessing list ', i, ': ', j, '/', #w.list)
            io.flush()
         end
      end
      print('\rProcessed list '..i..': '..(#w.list)..'/'..(#w.list))
   end
   print('Sorting merged word list')
   list:sort(function(a, b) return count_table[a] > count_table[b] end)
   print('Constructing merged word dictionary')
   local dict = tds.Hash()
   for i = 1, #list do
      dict[list[i]] = i
   end
   return list, count_table, freq_table, dict
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Parse one csv line into a table of field strings.
-- Quoted fields may contain the separator; a quote inside a quoted
-- field is escaped by doubling it ("").
-- line: the csv line. sep: field separator, defaults to ','.
-- Ref: http://lua-users.org/wiki/LuaCsv
function joe.parseCSVLine(line,sep)
   local res = {}
   local pos = 1
   sep = sep or ','
   while true do
      local c = string.sub(line,pos,pos)
      if (c == "") then break end
      if (c == '"') then
         -- Quoted value: separators inside the quotes are ignored.
         local txt = ""
         repeat
            local startp,endp = string.find(line,'^%b""',pos)
            txt = txt..string.sub(line,startp+1,endp-1)
            pos = endp + 1
            c = string.sub(line,pos,pos)
            if (c == '"') then txt = txt..'"' end
            -- If the char right after the closing quote is another
            -- quote, it is an escaped quote ("") and the run continues.
         until (c ~= '"')
         table.insert(res,txt)
         assert(c == sep or c == "")
         pos = pos + 1
      else
         -- Unquoted value: take everything up to the next separator.
         -- Bug fix: use plain-text matching so pattern magic characters
         -- in sep (e.g. '.' or '|') are not treated as Lua patterns.
         local startp,endp = string.find(line,sep,pos,true)
         if (startp) then
            table.insert(res,string.sub(line,pos,startp-1))
            pos = endp + 1
         else
            -- No separator left: the rest of the line is the last field.
            table.insert(res,string.sub(line,pos))
            break
         end
      end
   end
   return res
end
-- Run the program, then return the module table.
joe.main()
return joe
================================================
FILE: data/nytimes/construct_topic.py
================================================
#!/usr/bin/python3
'''
Create data from list of LZMA compressed archives of news articles
Copyright 2016 Xiang Zhang
Usage: python3 construct_topic.py -i [input directory] -o [output file]
'''
import argparse
import csv
import glob
import json
import lzma
import re
import urllib.parse
INPUT = '../data/nytimes/article'
OUTPUT = '../data/nytimes/topic/news.csv'
CLASS = '../data/nytimes/topic/class.csv'
def main():
    '''Parse command-line options into the module globals, then build
    the topic dataset.'''
    global INPUT, OUTPUT, CLASS
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input',
                        help = 'Input file directory', default = INPUT)
    parser.add_argument('-o', '--output',
                        help = 'Output file', default = OUTPUT)
    parser.add_argument('-c', '--classes',
                        help = 'Class file', default = CLASS)
    args = parser.parse_args()
    INPUT, OUTPUT, CLASS = args.input, args.output, args.classes
    createData()
def createData():
    '''Scan all .json.xz archives in INPUT, derive a class name from the
    section segment of each article url (/YYYY/MM/DD/<section>/...),
    and write "class_index,title,abstract" rows to OUTPUT plus the
    index -> class-name table to CLASS.'''
    # Class name -> 1-based class index, in order of first appearance.
    classes = dict()
    # Single compiled pattern replaces the original pair of overlapping
    # matches; group 1 is the section segment used as the class name.
    class_pattern = re.compile(r'/\d\d\d\d/\d\d/\d\d/([^/]+)')
    files = glob.glob(INPUT + '/*.json.xz')
    n = 0
    filecount = 0
    with open(OUTPUT, 'w', newline = '', encoding = 'utf-8') as ofd:
        writer = csv.writer(
            ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
        for filename in files:
            filecount = filecount + 1
            print('Processing file {}/{}: {}. Processed items {}.'.format(
                filecount, len(files), filename, n))
            try:
                # Context manager closes the archive even when a line
                # raises (the original leaked the handle on exception).
                with lzma.open(filename, 'rt', encoding = 'utf-8') as ifd:
                    for line in ifd:
                        news = json.loads(line)
                        title = news.get('title', '')
                        content = news.get('content', list())
                        abstract = content[0] if len(content) > 0 else ''
                        url = news.get('url', '')
                        if url == '':
                            continue
                        path = urllib.parse.urlparse(url).path
                        match = class_pattern.match(path)
                        if match is None:
                            continue
                        classname = match.group(1)
                        if classname not in classes:
                            classes[classname] = len(classes) + 1
                        writer.writerow(
                            [classes[classname],
                             title.replace('\n', '\\n'),
                             abstract.replace('\n', '\\n')])
                        n = n + 1
            except Exception as e:
                # Deliberately best-effort: one broken archive should
                # not abort the whole run.
                print('Exception (ignored): {}'.format(e))
    with open(CLASS, 'w', newline = '', encoding = 'utf-8') as cfd:
        class_writer = csv.writer(
            cfd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
        for key in classes:
            class_writer.writerow([classes[key], key])
# Entry point when run as a script.
if __name__ == '__main__':
    main()
================================================
FILE: data/nytimes/count_class.lua
================================================
--[[
Count data for each class and length
Copyright 2016 Xiang Zhang
Usage: th count_data.lua [input] [output]
--]]
local torch = require('torch')
-- A Logic Named Joe
local joe = {}
-- Program entry: count per-class samples in the input csv and save the
-- resulting table as a Torch serialization file.
function joe.main()
   local input = arg[1] or '../data/nytimes/topic/news_sorted.csv'
   local output = arg[2] or '../data/nytimes/topic/news_sorted_class.t7b'
   print('Counting data')
   local count = joe.count(input)
   -- NOTE(review): this replaces the joe.count function with the result
   -- table; harmless here since joe.count is not called again, but it
   -- looks unintended -- confirm before relying on the returned module.
   joe.count = count
   print('Saving to '..output)
   torch.save(output, count)
end
-- Count the number of samples in each class of a csv data file.
-- input: path to a csv file whose first field is the class index.
-- Returns a table mapping class index -> number of samples.
function joe.count(input)
   local count = {}
   local fd = io.open(input)
   local n = 0
   for line in fd:lines() do
      n = n + 1
      if math.fmod(n, 100000) == 0 then
         io.write('\rProcessing line: ', n)
         io.flush()
      end
      local content = joe.parseCSVLine(line)
      local class = tonumber(content[1])
      -- Dead code removed: the original also computed a trimmed text
      -- length for every field (an expensive gsub per line) but never
      -- used the result.
      count[class] = (count[class] or 0) + 1
   end
   print('\rProcessed lines: '..n)
   fd:close()
   return count
end
-- Parsing csv line
-- Ref: http://lua-users.org/wiki/LuaCsv
-- Parse one csv line into a table of field strings.
-- Quoted fields may contain the separator; a quote inside a quoted
-- field is escaped by doubling it ("").
-- line: the csv line. sep: field separator, defaults to ','.
-- Ref: http://lua-users.org/wiki/LuaCsv
function joe.parseCSVLine (line,sep)
   local res = {}
   local pos = 1
   sep = sep or ','
   while true do
      local c = string.sub(line,pos,pos)
      if (c == "") then break end
      if (c == '"') then
         -- Quoted value: separators inside the quotes are ignored.
         local txt = ""
         repeat
            local startp,endp = string.find(line,'^%b""',pos)
            txt = txt..string.sub(line,startp+1,endp-1)
            pos = endp + 1
            c = string.sub(line,pos,pos)
            if (c == '"') then txt = txt..'"' end
            -- If the char right after the closing quote is another
            -- quote, it is an escaped quote ("") and the run continues.
         until (c ~= '"')
         table.insert(res,txt)
         assert(c == sep or c == "")
         pos = pos + 1
      else
         -- Unquoted value: take everything up to the next separator.
         -- Bug fix: use plain-text matching so pattern magic characters
         -- in sep (e.g. '.' or '|') are not treated as Lua patterns.
         local startp,endp = string.find(line,sep,pos,true)
         if (startp) then
            table.insert(res,string.sub(line,pos,startp-1))
            pos = endp + 1
         else
            -- No separator left: the rest of the line is the last field.
            table.insert(res,string.sub(line,pos))
            break
         end
      end
   end
   return res
end
-- Run the program, then return the module table.
joe.main()
return joe
================================================
FILE: data/rakuten/construct_hepburn.py
================================================
#!/usr/bin/python3
'''
Convert Japanese datasets to Hepburn Romanization
Copyright 2016 Xiang Zhang
Usage: python3 construct_hepburn.py -i [input] -o [output]
'''
# Input file
INPUT = '../data/rakuten/sentiment/full_train.csv'
# Output file
OUTPUT = '../data/rakuten/sentiment/full_train_hepburn.csv'
import argparse
import csv
import MeCab
import romkan
import unidecode
# Main program
def main():
    '''Parse command-line arguments into the module globals, then
    romanize the input csv with a MeCab tagger.'''
    global INPUT, OUTPUT
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help = 'Input file',
                        default = INPUT)
    parser.add_argument('-o', '--output', help = 'Output file',
                        default = OUTPUT)
    args = parser.parse_args()
    INPUT, OUTPUT = args.input, args.output
    convertRoman(MeCab.Tagger())
def romanizeText(mecab, text):
    '''Segment text with the given MeCab tagger and return a list of
    tokens: the Hepburn romanization of the reading (feature index 7)
    when available, otherwise the surface form unchanged.'''
    tokens = []
    for line in mecab.parse(text).split('\n'):
        parts = line.split('\t')
        # MeCab emits "surface\tfeatures"; skip EOS and blank lines.
        if len(parts) != 2:
            continue
        surface, feature_str = parts
        features = feature_str.split(',')
        if len(features) > 7 and features[7] != '*':
            tokens.append(romkan.to_hepburn(features[7]))
        else:
            tokens.append(surface)
    return tokens
# Convert the text in Chinese to pintin
def convertRoman(mecab):
    '''Read the csv at INPUT and write a copy to OUTPUT in which every
    text field (all fields but the first, the label) is replaced by its
    space-joined Hepburn romanization.

    mecab: a MeCab.Tagger used for morphological analysis.'''
    n = 0
    # Context managers close both files; the original never closed
    # either handle.
    with open(INPUT, encoding = 'utf-8', newline = '') as ifd, \
         open(OUTPUT, 'w', encoding = 'utf-8', newline = '') as ofd:
        reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
        writer = csv.writer(
            ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
        for row in reader:
            new_row = [row[0]]
            for field in row[1:]:
                # Romanize, transliterate remaining non-ascii, escape
                # newlines, and strip each token before joining.
                tokens = [
                    unidecode.unidecode(t).replace('\n', '\\n').strip()
                    for t in romanizeText(mecab, field)]
                new_row.append(' '.join(tokens))
            writer.writerow(new_row)
            n = n + 1
            if n % 1000 == 0:
                print('\rProcessing line: {}'.format(n), end = '')
    print('\rProcessed lines: {}'.format(n))
# Entry point when run as a script.
if __name__ == '__main__':
    main()
================================================
FILE: data/rakuten/create_review.py
================================================
#!/usr/bin/python3
'''
Create data from list of LZMA compressed archives of reviews
Copyright 2016 Xiang Zhang
Usage: python3 create_data.py -i [input file pattern] -o [output file]
'''
import argparse
import csv
import glob
import json
import lzma
INPUT = '../data/rakuten/review/*.json.xz'
OUTPUT = '../data/rakuten/sentiment/review.csv'
def main():
    '''Parse command-line arguments into the module globals, then build
    the review dataset.'''
    global INPUT, OUTPUT
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input',
                        help = 'Input file pattern', default = INPUT)
    parser.add_argument('-o', '--output',
                        help = 'Output file', default = OUTPUT)
    args = parser.parse_args()
    INPUT, OUTPUT = args.input, args.output
    createData()
def createData():
    '''Extract (rate, title, comment) rows from every LZMA archive
    matching the INPUT glob pattern and write them to the OUTPUT csv.
    Entries without a rating are skipped.'''
    n = 0
    filecount = 0
    files = glob.glob(INPUT)
    with open(OUTPUT, 'w', newline = '', encoding = 'utf-8') as ofd:
        writer = csv.writer(
            ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
        for filename in files:
            filecount = filecount + 1
            print('Processing file {}/{}: {}. Processed items {}.'.format(
                filecount, len(files), filename, n))
            try:
                # Context manager closes the archive even when a line
                # raises (the original leaked the handle on exception).
                with lzma.open(filename, 'rt', encoding = 'utf-8') as ifd:
                    for line in ifd:
                        review = json.loads(line)
                        rate = review.get('rate', '')
                        title = review.get('title', '')
                        comment = review.get('comment', '')
                        # Skip entries without a rating label.
                        if rate != '':
                            n = n + 1
                            writer.writerow(
                                [rate, title.replace('\n', '\\n'),
                                 comment.replace('\n', '\\n')])
            except Exception as e:
                # Best-effort: ignore corrupt archives and keep going.
                print('Exception (ignored): {}'.format(e))
# Entry point when run as a script.
if __name__ == '__main__':
    main()
================================================
FILE: data/rakuten/segment_word.py
================================================
#!/usr/bin/python3
'''
Convert Japanese datasets to Index of Words
Copyright 2016 Xiang Zhang
Usage: python3 construct_pinyin.py -i [input] -l [list] -o [output] [-r]
'''
#Input file
INPUT = '../data/rakuten/sentiment/full_train.csv'
#Output file
OUTPUT = '../data/rakuten/sentiment/full_train_word.csv'
# List file
LIST = '../data/rakuten/sentiment/full_train_word_list.csv'
# Read already defined word list
READ = False
import argparse
import csv
import MeCab
# Main program
def main():
    '''Parse command-line arguments and drive word segmentation: either
    read an existing word list (-r) or build one from the input, then
    convert the input text fields into word-index sequences.'''
    global INPUT
    global OUTPUT
    global LIST
    # Fix: READ was assigned without a global declaration, so only a
    # function-local shadow was set and the module-level flag stayed
    # False (behavior inside main was unaffected, but the module state
    # was misleading).
    global READ
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help = 'Input file', default = INPUT)
    parser.add_argument(
        '-o', '--output', help = 'Output file', default = OUTPUT)
    parser.add_argument('-l', '--list', help = 'Word list file', default = LIST)
    parser.add_argument(
        '-r', '--read', help = 'Read from list file', action = 'store_true')
    args = parser.parse_args()
    INPUT = args.input
    OUTPUT = args.output
    LIST = args.list
    READ = args.read
    if READ:
        print('Reading word index')
        word_index = readWords()
    else:
        print('Counting words')
        word_count, word_freq = segmentWords()
        print('Sorting words by count')
        word_index = sortWords(word_count, word_freq)
    print('Constructing word index output')
    convertWords(word_index)
# Read from pre-existing word list
def readWords():
    '''Read the pre-existing word list csv at LIST and return a dict
    mapping each word (with '\\n' escapes restored) to its 1-based
    position in the list.'''
    word_index = dict()
    n = 0
    # Context manager closes the file (the original never closed it).
    with open(LIST, encoding = 'utf-8', newline = '') as ifd:
        reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
        for row in reader:
            word = row[0].replace('\\n', '\n')
            word_index[word] = n + 1
            n = n + 1
            if n % 1000 == 0:
                print('\rProcessing line: {}'.format(n), end = '')
    print('\rProcessed lines: {}'.format(n))
    return word_index
# Segment the text in Chinese
def segmentWords():
    '''First pass over INPUT: segment every text field with MeCab and
    tally, for each word, its total occurrence count and the fraction
    of rows in which it appears. Returns (count dict, frequency dict).'''
    tagger = MeCab.Tagger()
    ifd = open(INPUT, encoding = 'utf-8', newline = '')
    reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
    counts = dict()
    row_counts = dict()
    rows = 0
    for row in reader:
        seen = set()
        for field in row[1:]:
            text = field.replace('\\n', '\n')
            for token in tagger.parse(text).split('\n'):
                pieces = token.split('\t')
                # MeCab emits "surface\tfeatures"; skip EOS/blank lines.
                if len(pieces) != 2:
                    continue
                word = pieces[0]
                counts[word] = counts.get(word, 0) + 1
                if word not in seen:
                    seen.add(word)
                    row_counts[word] = row_counts.get(word, 0) + 1
        rows = rows + 1
        if rows % 1000 == 0:
            print('\rProcessing line: {}'.format(rows), end = '')
    print('\rProcessed lines: {}'.format(rows))
    ifd.close()
    # Normalize per-row appearance counts into frequencies.
    for word in row_counts:
        row_counts[word] = float(row_counts[word]) / float(rows)
    return counts, row_counts
# Sort words for a given count dictionary object
def sortWords(word_count, word_freq):
    '''Write the words to LIST in descending count order and return a
    dict mapping each word to its 1-based rank.

    word_count: word -> total occurrences.
    word_freq: word -> fraction of rows containing the word.'''
    ranked = sorted(word_count, key = word_count.get, reverse = True)
    ofd = open(LIST, 'w', encoding = 'utf-8', newline = '')
    writer = csv.writer(ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
    word_index = dict()
    n = 0
    for rank, word in enumerate(ranked, start = 1):
        writer.writerow([word.replace('\n', '\\n'), str(word_count[word]),
                         str(word_freq[word])])
        word_index[word] = rank
        n = rank
        if n % 1000 == 0:
            print('\rProcessing word: {}'.format(n), end = '')
    print('\rProcessed words: {}'.format(n))
    ofd.close()
    return word_index
# Convert the text in Chinese to word list
def convertWords(word_index):
    '''Second pass over INPUT: replace every text field with the
    space-joined indices of its segmented words and write the result to
    OUTPUT. Words absent from word_index map to the out-of-vocabulary
    index len(word_index) + 1.

    word_index: word -> 1-based index from the word list.'''
    mecab = MeCab.Tagger()
    # Hoisted loop-invariant: the original recomputed this per word.
    oov = len(word_index) + 1
    n = 0
    # Context managers close both files even when a row raises (the
    # original leaked them on exception).
    with open(INPUT, encoding = 'utf-8', newline = '') as ifd, \
         open(OUTPUT, 'w', encoding = 'utf-8', newline = '') as ofd:
        reader = csv.reader(ifd, quoting = csv.QUOTE_ALL)
        writer = csv.writer(
            ofd, quoting = csv.QUOTE_ALL, lineterminator = '\n')
        for row in reader:
            new_row = [row[0]]
            for field in row[1:]:
                text = field.replace('\\n', '\n')
                words = []
                for token in mecab.parse(text).split('\n'):
                    pieces = token.split('\t')
                    if len(pieces) == 2:
                        words.append(pieces[0])
                new_row.append(' '.join(
                    str(word_index.get(word, oov)) for word in words))
            writer.writerow(new_row)
            n = n + 1
            if n % 1000 == 0:
                print('\rProcessing line: {}'.format(n), end = '')
    print('\rProcessed lines: {}'.format(n))
# Entry point when run as a script.
if __name__ == '__main__':
    main()
================================================
FILE: doc/dianping.md
================================================
# Dianping
This documentation contains information on how to reproduce all the results for the `Dianping` datasets in the paper.
The root directory `/` in this documentation indicates the root directory of this repository.
## Download the dataset
Original text data for training and testing are available via these two links: [`train.csv.xz`](https://goo.gl/uKPxyo) [`test.csv.xz`](https://goo.gl/2QZpLx). When you download them, make sure to put them in the `/data/data/dianping` directory and unxz so that you have `train.csv` and `test.csv` available.
## GlyphNet
This section introduces how to prepare and run GlyphNet experiments.
### Prepare GNU Unifont
Running the glyphnet training script requires the GNU Unifont character images. We have built these images into a Torch 7 binary serialization file and it can be downloaded via this link: [`unifont-8.0.1.t7b.xz`](https://goo.gl/aFxYHq). After downloading, put it in the `/unifont/unifont` directory and unxz so that you have `unifont-8.0.1.t7b` available.
### Build Byte Serialization Files
The next step is to build the serialized code files. The first step is to build the string serialization files. Switch to the `/data/dianping` directory, then execute the following commands
```bash
th construct_string.lua ../data/dianping/train.csv ../data/dianping/train_string.t7b
th construct_string.lua ../data/dianping/test.csv ../data/dianping/test_string.t7b
```
These 2 commands will build byte serialization files for the samples in its original language. It assumes the texts are contained in a comma-separated-value format in which the first field is treated as the class index (starting from 1), and the remaining fields are all texts.
The output files contain a lua table that has the following members
* `index`: a table that contains index tensors for each class. For example `index[i]` is an n x m x 2 `LongTensor` that contains the starting position and length of byte string representing each sample in class i. We assume that class i contains n samples, and there are m text fields in the CSV file.
* `content`: a `ByteTensor` that contains the serialization of the strings of all samples. Each string is ended with a 0 byte, which is not included in the length count in `index`.
### Build Unicode Serialization Files
From this byte-level serialization, we will be able to construct serialization files that contain unicode values to be used in the `glyphnet` training scripts. To do this, execute the following 2 commands
```bash
th construct_code.lua ../data/dianping/train_string.t7b ../data/dianping/train_code.t7b
th construct_code.lua ../data/dianping/test_string.t7b ../data/dianping/test_code.t7b
```
Each of these code files contains a lua table that has 2 `LongTensor` members: `code` and `code_value`. They have a similar structure as the `index` and `content` members of the byte serialization files, but in this case they are for unicode values.
### Execute the Experiments
Then, you can switch to `/glyphnet`, and execute the following scripts to run the training program for the large GlyphNet
```bash
mkdir -p models/dianping/spatial8temporal12length512feature256
./archive/dianping_spatial8temporal12length512feature256.sh
```
The first command simply creates a directory where checkpointing files will be written into during training. Note that the shell scripts also accepts command-line parameters and can pass it directly to the training program. The most useful ones are probably `-driver_visualize false` and `-driver_plot false`, that disable visualization and plotting so that you can run the training programs on a headless server. You can also use `-driver_resume true` to resume from checkpointed experiments. These parameters are available for all Torch 7 training programs.
Similarly, the following commands execute the experiment for the small GlyphNet
```bash
mkdir -p models/dianping/spatial6temporal8length486feature256
./archive/dianping_spatial6temporal8length486feature256.sh
```
## OnehotNet
This section details how to execute OnehotNet experiments. Note that OnehotNet in this article are operating at byte-level for either the original text or the romanized text. In the case of romanized text, it is the same as character-level.
### Byte-Level OnehotNet for Original Text
To train OnehotNet for the original text, we only need the previously built byte serialization files. If you do not have them, see previous sections for using `construct_string.lua` data processing scripts.
#### Execute the Experiments
Assuming your current working directory is `/onehotnet`, the following commands execute experiments for large OnehotNet on the original text samples.
```bash
mkdir -p models/dianping/onehot4temporal12length2048feature256
./archive/dianping_onehot4temporal12length2048feature256.sh
```
Similarly, the small OnehotNet experiments can be done using the following commands
```bash
mkdir -p models/dianping/onehot4temporal8length1944feature256
./archive/dianping_onehot4temporal8length1944feature256.sh
```
### Character-Level OnehotNet for Romanized Text
This section details how to execute OnehotNet for romanized text. But before that, we need to build the romanized data first.
#### Build Romanized Text Serialization Files
The first step is to convert the original text into a romanization format. This is done in this project automatically using the [`pypinyin`](https://github.com/mozillazg/python-pinyin) package (version 0.12 for the results in the paper). You also want to install [`jieba`](https://github.com/fxsjy/jieba) (version 0.38 for the results in the paper) so that `pypinyin` can use it for word segmentation. All these packages were installed in a Python 3 environment.
Switch the working directory to `/data/dianping`; the following commands convert the original text to a romanized format for the Dianping dataset.
```bash
python3 construct_pinyin.py -i ../data/dianping/train.csv -o ../data/dianping/train_pinyin.csv
python3 construct_pinyin.py -i ../data/dianping/test.csv -o ../data/dianping/test_pinyin.csv
```
Then, we can use `construct_string.lua` again for constructing the byte serialization of romanized texts.
```bash
th construct_string.lua ../data/dianping/train_pinyin.csv ../data/dianping/train_pinyin_string.t7b
th construct_string.lua ../data/dianping/test_pinyin.csv ../data/dianping/test_pinyin_string.t7b
```
#### Execute the Experiments
Assuming your current working directory is `/onehotnet`, the following commands execute experiments for large OnehotNet on the romanized text samples.
```bash
mkdir -p models/dianping/onehot4temporal12length2048feature256roman
./archive/dianping_onehot4temporal12length2048feature256roman.sh
```
Similarly, the small OnehotNet experiments can be done using the following commands
```bash
mkdir -p models/dianping/onehot4temporal8length1944feature256roman
./archive/dianping_onehot4temporal8length1944feature256roman.sh
```
## EmbedNet
This section introduces how to build the data files and executing experiments for EmbedNet.
### Character-Level EmbedNet for Original Text
Since we already built the serialization data files for unicode characters for GlyphNet, we can directly use them. The only step required is to run the commands for training the models.
Assuming the current working directory is `/embednet`, the following commands will start the training process for large character-level EmbedNet.
```bash
mkdir -p models/dianping/temporal12length512feature256
./archive/dianping_temporal12length512feature256.sh
```
And for small character-level EmbedNet
```bash
mkdir -p models/dianping/temporal8length486feature256
./archive/dianping_temporal8length486feature256.sh
```
### Byte-Level EmbedNet for Original Text
This section details how to train byte-level EmbedNet for the original text
#### Convert Byte Serialization Files
Since the EmbedNet training program assumes the data files contain a table of 2 members `code` and `code_value`, we need to change the variable names in the string serialization files to match this. This can be done in `/data/dianping` by executing the following commands
```bash
th convert_string_code.lua ../data/dianping/train_string.t7b ../data/dianping/train_string_code.t7b
th convert_string_code.lua ../data/dianping/test_string.t7b ../data/dianping/test_string_code.t7b
```
#### Execute the Experiments
Assuming the current working directory is `/embednet`, the following commands start the training process for the large byte-level EmbedNet
```bash
mkdir -p models/dianping/temporal12length512feature256byte
./archive/dianping_temporal12length512feature256byte.sh
```
And for small byte-level EmbedNet
```bash
mkdir -p models/dianping/temporal8length486feature256byte
./archive/dianping_temporal8length486feature256byte.sh
```
### Character-Level EmbedNet for Romanized Text
Note that characters for romanized text is the same as bytes. Therefore, the steps are exactly the same as the byte-level EmbedNet, except for romanized text instead of original text.
#### Convert Byte Serialization Files
In `/data/dianping`, execute the following commands
```bash
th convert_string_code.lua ../data/dianping/train_pinyin_string.t7b ../data/dianping/train_pinyin_string_code.t7b
th convert_string_code.lua ../data/dianping/test_pinyin_string.t7b ../data/dianping/test_pinyin_string_code.t7b
```
#### Execute the Experiments
Assuming the current working directory is `/embednet`, the following commands start the training process for the large character-level EmbedNet for romanized text
```bash
mkdir -p models/dianping/temporal12length512feature256roman
./archive/dianping_temporal12length512feature256roman.sh
```
And for small EmbedNet
```bash
mkdir -p models/dianping/temporal8length486feature256roman
./archive/dianping_temporal8length486feature256roman.sh
```
### Word-Level Embednet for Original Text
This section introduces how to segment word from the text, build the word serialization files, and execute the commands.
#### Build Word Serialization Files for Original Text
The first step for building the word serialization files is to segment the words. This is done by executing a Python 3 script as follows, assuming you have the [`jieba`](https://github.com/fxsjy/jieba) package installed (version 0.38 for the results in the paper) and the working directory is `/data/dianping`.
```bash
python3 segment_word.py -i ../data/dianping/train.csv -o ../data/dianping/train_word.csv -l ../data/dianping/train_word_list.csv
python3 segment_word.py -i ../data/dianping/test.csv -o ../data/dianping/test_word.csv -l ../data/dianping/train_word_list.csv -r
```
The first command generates 2 data files. `train_word.csv` is a file containing sequences of indices of segmented words from the original text fields, whereas `train_word_list.csv` contains the list of words. The second command reads the same list of words generated from the training data (therefore the `-r` option) and uses that list to build sequences for the testing data. This is done deliberately so that new words not in the training data are not considered for classification results.
The second step is to build the word serialization files from the segmentation results.
```bash
th construct_word.lua ../data/dianping/train_word.csv ../data/dianping/train_word.t7b
th construct_word.lua ../data/dianping/test_word.csv ../data/dianping/test_word.t7b
```
#### Execute the Experiments
When we have `train_word.t7b` and `test_word.t7b`, we can start executing the experiments for word-level EmbedNet models. Assume that the current directory is `/embednet`, the following commands start the training process for the large word-level EmbedNet for original text
```bash
mkdir -p models/dianping/temporal12length512feature256word
./archive/dianping_temporal12length512feature256word.sh
```
And for small EmbedNet
```bash
mkdir -p models/dianping/temporal8length486feature256word
./archive/dianping_temporal8length486feature256word.sh
```
### Word-Level EmbedNet for Romanized Text
Similar to the original text, romanized text also require word segmentation before being able to pass through the EmbedNet training program.
#### Build Word Serialization Files for Romanized Text
Word segmentation for romanized text is pretty simple. Assume you are in `/data/dianping`, the following commands do the job
```bash
th segment_roman_word.lua ../data/dianping/train_pinyin.csv ../data/dianping/train_pinyin_word.csv ../data/dianping/train_pinyin_word_list.csv
th segment_roman_word.lua ../data/dianping/test_pinyin.csv ../data/dianping/test_pinyin_word.csv ../data/dianping/train_pinyin_word_list.csv true
```
Note the additional `true` argument in the second command-line to inform the script to use the training word list for constructing the indices for the testing data.
Then, word serialization files can be built from the segmentation results using the following commands.
```bash
th construct_word.lua ../data/dianping/train_pinyin_word.csv ../data/dianping/train_pinyin_word.t7b
th construct_word.lua ../data/dianping/test_pinyin_word.csv ../data/dianping/test_pinyin_word.t7b
```
#### Execute the Experiments
When we have `train_pinyin_word.t7b` and `test_pinyin_word.t7b`, we can start executing the experiments for word-level EmbedNet models. Assume that the current directory is `/embednet`; the following commands start the training process for the large word-level EmbedNet for romanized text
```bash
mkdir -p models/dianping/temporal12length512feature256romanword
./archive/dianping_temporal12length512feature256romanword.sh
```
And for small EmbedNet
```bash
mkdir -p models/dianping/temporal8length486feature256romanword
./archive/dianping_temporal8length486feature256romanword.sh
```
## Linear Model
This section details how to reproduce the results for linear models.
### Character-Level 1-Gram Linear Model for Original Text
To run the linear model for using bag-of-character features, we need to build the feature serialization files first.
#### Build Character-Level 1-Gram Feature Serialization Files
To build the character-level 1-gram feature serialization files, execute the following commands from `/data/dianping`.
```bash
th construct_charbag.lua ../data/dianping/train_code.t7b ../data/dianping/train_charbag.t7b ../data/dianping/train_charbag_list.csv
th construct_charbag.lua ../data/dianping/test_code.t7b ../data/dianping/test_charbag.t7b ../data/dianping/train_charbag_list.csv true
```
The first command creates a file `train_charbag.t7b`, which contains a table that has the following members
* `bag`: a table where `bag[i]` contains a n-by-2 `LongTensor`. It contains the beginning index and length of values in `bag_index` and `bag_value` for each sample.
* `bag_index`: a 1-D `LongTensor` that contains the character indices of all samples.
* `bag_value`: a 1-D `DoubleTensor` that contains the frequency of the corresponding character indices.
The second command creates the feature serialization file for testing data, but using the same character index that was created from training data. The additional `true` parameter means to read from the list rather than create a new one.
All of the feature serialization files for linear models has the same data structure design.
To prepare feature serialization files for the TFIDF variant of bag-of-character linear model, execute the following commands from `/data/dianping`
```bash
th construct_tfidf.lua ../data/dianping/train_charbag.t7b ../data/dianping/train_charbagtfidf.t7b ../data/dianping/train_charbag_list.csv
th construct_tfidf.lua ../data/dianping/test_charbag.t7b ../data/dianping/test_charbagtfidf.t7b ../data/dianping/train_charbag_list.csv
```
Note that constructing serialization files for testing data still uses the character frequency list from training data.
#### Execute the Experiments
To execute the experiment for character-level 1-gram linear model, execute the following commands from `/linearnet`
```bash
mkdir -p models/dianping/charbag
./archive/dianping_charbag.sh
```
To execute the experiment for the TFIDF version, execute the following command from `/linearnet`
```bash
mkdir -p models/dianping/charbagtfidf
./archive/dianping_charbagtfidf.sh
```
### Character-Level 5-Gram Linear Model for Original Text
Before being able to execute the 5-gram experiments, we have to build the feature serialization files first.
#### Build Character-Level 5-Gram Feature Serialization Files
In this work, 5-gram features actually mean features of grams from 1 to 5. It is usually infeasible to store all of these features in memory, and building the features could take a significant amount of time. Therefore, we build a list of grams ranked by their frequency via a multi-threaded program first, and then build the 5-gram feature serialization files using it.
To build the list of character grams, execute the following commands from `/data/dianping`
```bash
mkdir -p ../data/dianping/train_chargram_count
th count_chargram.lua ../data/dianping/train_code.t7b ../data/dianping/train_chargram_count/
mkdir -p ../data/dianping/train_chargram_count_sort
./sort_gram_count.sh ../data/dianping/train_chargram_count ../data/dianping/train_chargram_count_sort /tmp
th combine_gram_count.lua ../data/dianping/train_chargram_count_sort/ ../data/dianping/train_chargram_count_combine.csv
./sort_gram_list.sh ../data/dianping/train_chargram_count_combine.csv ../data/dianping/train_chargram_list.csv
./limit_csvlines.sh ../data/dianping/train_chargram_list.csv ../data/dianping/train_chargram_list_limit.csv 1000001
```
The commands proceed by first using 10 threads to construct chunks of counts of character grams, and then sorting and combining them to form the combined list. It is then sorted to list grams by their frequency, and finally we choose the 1,000,001 most frequent ones. This should be enough because we are limiting the number of features in 5-gram models to 1,000,000.
Then, you can build the character-level 5-gram feature serialization files using the following commands from `/data/dianping`
```bash
th construct_chargram.lua ../data/dianping/train_code.t7b ../data/dianping/train_chargram.t7b ../data/dianping/train_chargram_list_limit.csv
th construct_chargram.lua ../data/dianping/test_code.t7b ../data/dianping/test_chargram.t7b ../data/dianping/train_chargram_list_limit.csv
```
Note that the features for testing data are built using the gram list from the training data.
To build the feature serialization files for TFIDF version of the model, execute the following commands from `/data/dianping`
```bash
th construct_tfidf.lua ../data/dianping/train_chargram.t7b ../data/dianping/train_chargramtfidf.t7b ../data/dianping/train_chargram_list_limit.csv 1000000
th construct_tfidf.lua ../data/dianping/test_chargram.t7b ../data/dianping/test_chargramtfidf.t7b ../data/dianping/train_chargram_list_limit.csv 1000000
```
#### Execute the Experiments
To execute the experiment for character-level 5-gram linear model, run the following commands from `/linearnet`
```bash
mkdir -p models/dianping/chargram
./archive/dianping_chargram.sh
```
And for the TFIDF version
```bash
mkdir -p models/dianping/chargramtfidf
./archive/dianping_chargramtfidf.sh
```
### Word-Level 1-Gram Linear Model for Original Text
This section first introduces how to build bag-of-word features, and then details how to execute the experiments.
#### Build Word-Level 1-Gram Feature Serialization Files
The following commands from `/data/dianping` can create the word-level 1-gram features for linear model
```bash
th construct_wordbag.lua ../data/dianping/train_word.t7b ../data/dianping/train_wordbag.t7b 200000 200001
th construct_wordbag.lua ../data/dianping/test_word.t7b ../data/dianping/test_wordbag.t7b 200000 200001
```
This is possible because the word segmentation process previously done for word-level EmbedNet already sorts the words by their frequency in the training data. The program also automatically limits the number of features to 200000 and replaces all other features with the 200001-th one.
To construct the TFIDF feature, simply execute the following commands from `/data/dianping`
```bash
th construct_tfidf.lua ../data/dianping/train_wordbag.t7b ../data/dianping/train_wordbagtfidf.t7b ../data/dianping/train_word_list.csv 200000
th construct_tfidf.lua ../data/dianping/test_wordbag.t7b ../data/dianping/test_wordbagtfidf.t7b ../data/dianping/train_word_list.csv 200000
```
#### Execute the Experiments
From `/linearnet`, the following commands execute the experiment for bag-of-word model
```bash
mkdir -p models/dianping/wordbag
./archive/dianping_wordbag.sh
```
And for the TFIDF version
```bash
mkdir -p models/dianping/wordbagtfidf
./archive/dianping_wordbagtfidf.sh
```
### Word-Level 5-Gram Linear Model for Original Text
This section introduces how to build word-level 5-gram feature serialization files and how to execute the experiments.
#### Build Word-Level 5-Gram Feature Serialization Files
Similar to the character-level 5-gram features, we need a multi-threaded program to build the list of grams first before being able to build the feature serialization files. The list can be built by executing the following commands from `/data/dianping`
```bash
mkdir -p ../data/dianping/train_wordgram_count
th count_wordgram.lua ../data/dianping/train_word.t7b ../data/dianping/train_wordgram_count/ ../data/dianping/train_word_list.csv
mkdir -p ../data/dianping/train_wordgram_count_sort
./sort_gram_count.sh ../data/dianping/train_wordgram_count ../data/dianping/train_wordgram_count_sort /tmp
th combine_gram_count.lua ../data/dianping/train_wordgram_count_sort/ ../data/dianping/train_wordgram_count_combine.csv
./sort_gram_list.sh ../data/dianping/train_wordgram_count_combine.csv ../data/dianping/train_wordgram_list.csv
./limit_csvlines.sh ../data/dianping/train_wordgram_list.csv ../data/dianping/train_wordgram_list_limit.csv 1000001
```
The commands proceed by first using 10 threads to construct chunks of counts of word grams, and then sorting and combining them to form the combined list. It is then sorted to list grams by their frequency, and finally we choose the 1,000,001 most frequent ones. This should be enough because we are limiting the number of features in 5-gram models to 1,000,000.
Then, you can build the word-level 5-gram feature serialization files using the following commands from `/data/dianping`
```bash
th construct_wordgram.lua ../data/dianping/train_word.t7b ../data/dianping/train_wordgram.t7b ../data/dianping/train_wordgram_list_limit.csv
th construct_wordgram.lua ../data/dianping/test_word.t7b ../data/dianping/test_wordgram.t7b ../data/dianping/train_wordgram_list_limit.csv
```
Note that the features for testing data are built using the gram list from the training data.
To build the feature serialization files for TFIDF version of the model, execute the following commands from `/data/dianping`
```bash
th construct_tfidf.lua ../data/dianping/train_wordgram.t7b ../data/dianping/train_wordgramtfidf.t7b ../data/dianping/train_wordgram_list_limit.csv 1000000
th construct_tfidf.lua ../data/dianping/test_wordgram.t7b ../data/dianping/test_wordgramtfidf.t7b ../data/dianping/train_wordgram_list_limit.csv 1000000
```
#### Execute the Experiments
To execute the experiment for word-level 5-gram linear model, run the following commands from `/linearnet`
```bash
mkdir -p models/dianping/wordgram
./archive/dianping_wordgram.sh
```
And for the TFIDF version
```bash
mkdir -p models/dianping/wordgramtfidf
./archive/dianping_wordgramtfidf.sh
```
### Word-Level 1-Gram Linear Model for Romanized Text
This section first introduces how to build bag-of-word features for romanized text, and then details how to execute the experiments.
#### Build Word-Level 1-Gram Feature Serialization Files
The following commands from `/data/dianping` can create the word-level 1-gram features for romanized text
```bash
th construct_wordbag.lua ../data/dianping/train_pinyin_word.t7b ../data/dianping/train_pinyin_wordbag.t7b 200000 200001
th construct_wordbag.lua ../data/dianping/test_pinyin_word.t7b ../data/dianping/test_pinyin_wordbag.t7b 200000 200001
```
This is possible because the word segmentation process previously done for romanized word-level EmbedNet already sorts the words by their frequency in the training data. The program also automatically limits the number of features to 200000 and replaces all other features with the 200001-th one.
To construct the TFIDF feature, simply execute the following commands from `/data/dianping`
```bash
th construct_tfidf.lua ../data/dianping/train_pinyin_wordbag.t7b ../data/dianping/train_pinyin_wordbagtfidf.t7b ../data/dianping/train_pinyin_word_list.csv 200000
th construct_tfidf.lua ../data/dianping/test_pinyin_wordbag.t7b ../data/dianping/test_pinyin_wordbagtfidf.t7b ../data/dianping/train_pinyin_word_list.csv 200000
```
#### Execute the Experiments
From `/linearnet`, the following commands execute the experiment for bag-of-word model for romanized text
```bash
mkdir -p models/dianping/wordbagroman
./archive/dianping_wordbagroman.sh
```
And for the TFIDF version
```bash
mkdir -p models/dianping/wordbagtfidfroman
./archive/dianping_wordbagtfidfroman.sh
```
### Word-Level 5-Gram Linear Model for Romanized Text
This section introduces how to build word-level 5-gram feature serialization files for romanized text and how to execute the experiments.
#### Build Word-Level 5-Gram Feature Serialization Files
Similar to the character-level 5-gram features, we need a multi-threaded program to build the list of grams first before being able to build the feature serialization files. The list can be built by executing the following commands from `/data/dianping`
```bash
mkdir -p ../data/dianping/train_pinyin_wordgram_count
th count_wordgram.lua ../data/dianping/train_pinyin_word.t7b ../data/dianping/train_pinyin_wordgram_count/ ../data/dianping/train_pinyin_word_list.csv
mkdir -p ../data/dianping/train_pinyin_wordgram_count_sort
./sort_gram_count.sh ../data/dianping/train_pinyin_wordgram_count ../data/dianping/train_pinyin_wordgram_count_sort /tmp
th combine_gram_count.lua ../data/dianping/train_pinyin_wordgram_count_sort/ ../data/dianping/train_pinyin_wordgram_count_combine.csv
./sort_gram_list.sh ../data/dianping/train_pinyin_wordgram_count_combine.csv ../data/dianping/train_pinyin_wordgram_list.csv
./limit_csvlines.sh ../data/dianping/train_pinyin_wordgram_list.csv ../data/dianping/train_pinyin_wordgram_list_limit.csv 1000001
```
The commands proceed by first using 10 threads to construct chunks of counts of word grams, and then sorting and combining them to form the combined list. It is then sorted to list grams by their frequency, and finally we choose the 1,000,001 most frequent ones. This should be enough because we are limiting the number of features in 5-gram models to 1,000,000.
Then, you can build the word-level 5-gram feature serialization files for romanized text using the following commands from `/data/dianping`
```bash
th construct_wordgram.lua ../data/dianping/train_pinyin_word.t7b ../data/dianping/train_pinyin_wordgram.t7b ../data/dianping/train_pinyin_wordgram_list_limit.csv
th construct_wordgram.lua ../data/dianping/test_pinyin_word.t7b ../data/dianping/test_pinyin_wordgram.t7b ../data/dianping/train_pinyin_wordgram_list_limit.csv
```
Note that the features for testing data are built using the gram list from the training data.
To build the feature serialization files for TFIDF version of the model, execute the following commands from `/data/dianping`
```bash
th construct_tfidf.lua ../data/dianping/train_pinyin_wordgram.t7b ../data/dianping/train_pinyin_wordgramtfidf.t7b ../data/dianping/train_pinyin_wordgram_list_limit.csv 1000000
th construct_tfidf.lua ../data/dianping/test_pinyin_wordgram.t7b ../data/dianping/test_pinyin_wordgramtfidf.t7b ../data/dianping/train_pinyin_wordgram_list_limit.csv 1000000
```
#### Execute the Experiments
To execute the experiment for word-level 5-gram linear model for romanized text, run the following commands from `/linearnet`
```bash
mkdir -p models/dianping/wordgramroman
./archive/dianping_wordgramroman.sh
```
And for the TFIDF version
```bash
mkdir -p models/dianping/wordgramtfidfroman
./archive/dianping_wordgramtfidfroman.sh
```
## fastText
This section introduces how to build the token files and run experiments for the fastText models. Note that before being able to execute the experiments in this section, you must make sure that you have [fastText](https://github.com/facebookresearch/fastText) installed and there is `fasttext` command in your `PATH`.
### Character-Level fastText for Original Text
We first build the token files for character-level fastText, and then detail how to execute the experiments.
#### Build Character-Level Token Files
To build the character token files from the original text files, execute the following commands from `/data/dianping`
```bash
th construct_chartoken.lua ../data/dianping/train.csv ../data/dianping/train_chartoken.txt
th construct_chartoken.lua ../data/dianping/test.csv ../data/dianping/test_chartoken.txt
```
Optionally, you can also build the evaluation token files by separating the training dataset to a 1:9 ratio.
```bash
./shuffle_lines.sh ../data/dianping/train_chartoken.txt ../data/dianping/train_chartoken_shuffle.txt
./split_lines.sh 1800000 ../data/dianping/train_chartoken_shuffle.txt ../data/dianping/train_chartoken_shuffle_split_
```
Note that the second command above will produce 2 files `train_chartoken_shuffle_split_0.txt` and `train_chartoken_shuffle_split_1.txt`.
#### Execute the Experiments
To execute the character-level 1-gram evaluation experiment, do the following commands from `/fasttext`
```bash
mkdir -p models/dianping/charunigram_evaluation
./archive/dianping_charunigram_evaluation.sh
```
This will iterate through 2, 5 and 10 epochs for the best option on the evaluation data. You can check whether the evaluated hyperparameter conforms to that in the paper.
To execute the character-level 1-gram experiment, use the following commands from `/fasttext`
gitextract_d_gx1n5i/
├── LICENSE
├── README.md
├── data/
│ ├── 11st/
│ │ ├── construct_rr.py
│ │ ├── create_post.py
│ │ ├── create_review.py
│ │ ├── segment_rr_word.lua
│ │ └── segment_word.py
│ ├── README.md
│ ├── chinanews/
│ │ └── construct_topic.py
│ ├── data/
│ │ └── README.txt
│ ├── dianping/
│ │ ├── combine_gram_count.lua
│ │ ├── construct_charbag.lua
│ │ ├── construct_chargram.lua
│ │ ├── construct_chartoken.lua
│ │ ├── construct_code.lua
│ │ ├── construct_pinyin.py
│ │ ├── construct_reviews.lua
│ │ ├── construct_string.lua
│ │ ├── construct_tfidf.lua
│ │ ├── construct_word.lua
│ │ ├── construct_wordbag.lua
│ │ ├── construct_wordgram.lua
│ │ ├── construct_wordtoken.lua
│ │ ├── convert_string_code.lua
│ │ ├── count_chargram.lua
│ │ ├── count_wordgram.lua
│ │ ├── limit_code.lua
│ │ ├── limit_csvlines.sh
│ │ ├── queue.lua
│ │ ├── remove_duplication.py
│ │ ├── remove_null.sh
│ │ ├── segment_roman_word.lua
│ │ ├── segment_word.py
│ │ ├── select_data.lua
│ │ ├── shuffle_lines.sh
│ │ ├── sort_gram_count.sh
│ │ ├── sort_gram_list.sh
│ │ ├── split_lines.sh
│ │ └── split_train.lua
│ ├── ifeng/
│ │ └── construct_topic.py
│ ├── jd/
│ │ ├── count_data.lua
│ │ ├── create_comment.py
│ │ ├── limit_length.lua
│ │ └── sort_data.sh
│ ├── joint/
│ │ ├── combine_word.lua
│ │ └── combine_word_list.lua
│ ├── nytimes/
│ │ ├── construct_topic.py
│ │ └── count_class.lua
│ └── rakuten/
│ ├── construct_hepburn.py
│ ├── create_review.py
│ └── segment_word.py
├── doc/
│ └── dianping.md
├── embednet/
│ ├── archive/
│ │ ├── 11stbinary_temporal12length512feature256.sh
│ │ ├── 11stbinary_temporal12length512feature256byte.sh
│ │ ├── 11stbinary_temporal12length512feature256roman.sh
│ │ ├── 11stbinary_temporal12length512feature256romanword.sh
│ │ ├── 11stbinary_temporal12length512feature256word.sh
│ │ ├── 11stbinary_temporal8length486feature256.sh
│ │ ├── 11stbinary_temporal8length486feature256byte.sh
│ │ ├── 11stbinary_temporal8length486feature256roman.sh
│ │ ├── 11stbinary_temporal8length486feature256romanword.sh
│ │ ├── 11stbinary_temporal8length486feature256word.sh
│ │ ├── 11stfull_temporal12length512feature256.sh
│ │ ├── 11stfull_temporal12length512feature256byte.sh
│ │ ├── 11stfull_temporal12length512feature256roman.sh
│ │ ├── 11stfull_temporal12length512feature256romanword.sh
│ │ ├── 11stfull_temporal12length512feature256word.sh
│ │ ├── 11stfull_temporal8length486feature256.sh
│ │ ├── 11stfull_temporal8length486feature256byte.sh
│ │ ├── 11stfull_temporal8length486feature256roman.sh
│ │ ├── 11stfull_temporal8length486feature256romanword.sh
│ │ ├── 11stfull_temporal8length486feature256word.sh
│ │ ├── amazonbinary_temporal12length512feature256.sh
│ │ ├── amazonbinary_temporal12length512feature256word.sh
│ │ ├── amazonbinary_temporal8length486feature256.sh
│ │ ├── amazonbinary_temporal8length486feature256word.sh
│ │ ├── amazonfull_temporal12length512feature256.sh
│ │ ├── amazonfull_temporal12length512feature256word.sh
│ │ ├── amazonfull_temporal8length486feature256.sh
│ │ ├── amazonfull_temporal8length486feature256word.sh
│ │ ├── chinanews_temporal12length512feature256.sh
│ │ ├── chinanews_temporal12length512feature256byte.sh
│ │ ├── chinanews_temporal12length512feature256roman.sh
│ │ ├── chinanews_temporal12length512feature256romanword.sh
│ │ ├── chinanews_temporal12length512feature256word.sh
│ │ ├── chinanews_temporal8length486feature256.sh
│ │ ├── chinanews_temporal8length486feature256byte.sh
│ │ ├── chinanews_temporal8length486feature256roman.sh
│ │ ├── chinanews_temporal8length486feature256romanword.sh
│ │ ├── chinanews_temporal8length486feature256word.sh
│ │ ├── dianping_temporal12length512feature256.sh
│ │ ├── dianping_temporal12length512feature256byte.sh
│ │ ├── dianping_temporal12length512feature256roman.sh
│ │ ├── dianping_temporal12length512feature256romanword.sh
│ │ ├── dianping_temporal12length512feature256word.sh
│ │ ├── dianping_temporal8length486feature256.sh
│ │ ├── dianping_temporal8length486feature256byte.sh
│ │ ├── dianping_temporal8length486feature256roman.sh
│ │ ├── dianping_temporal8length486feature256romanword.sh
│ │ ├── dianping_temporal8length486feature256word.sh
│ │ ├── ifeng_temporal12length512feature256.sh
│ │ ├── ifeng_temporal12length512feature256byte.sh
│ │ ├── ifeng_temporal12length512feature256roman.sh
│ │ ├── ifeng_temporal12length512feature256romanword.sh
│ │ ├── ifeng_temporal12length512feature256word.sh
│ │ ├── ifeng_temporal8length486feature256.sh
│ │ ├── ifeng_temporal8length486feature256byte.sh
│ │ ├── ifeng_temporal8length486feature256roman.sh
│ │ ├── ifeng_temporal8length486feature256romanword.sh
│ │ ├── ifeng_temporal8length486feature256word.sh
│ │ ├── jdbinary_temporal12length512feature256.sh
│ │ ├── jdbinary_temporal12length512feature256byte.sh
│ │ ├── jdbinary_temporal12length512feature256roman.sh
│ │ ├── jdbinary_temporal12length512feature256romanword.sh
│ │ ├── jdbinary_temporal12length512feature256word.sh
│ │ ├── jdbinary_temporal8length486feature256.sh
│ │ ├── jdbinary_temporal8length486feature256byte.sh
│ │ ├── jdbinary_temporal8length486feature256roman.sh
│ │ ├── jdbinary_temporal8length486feature256romanword.sh
│ │ ├── jdbinary_temporal8length486feature256word.sh
│ │ ├── jdfull_temporal12length512feature256.sh
│ │ ├── jdfull_temporal12length512feature256byte.sh
│ │ ├── jdfull_temporal12length512feature256roman.sh
│ │ ├── jdfull_temporal12length512feature256romanword.sh
│ │ ├── jdfull_temporal12length512feature256word.sh
│ │ ├── jdfull_temporal8length486feature256.sh
│ │ ├── jdfull_temporal8length486feature256byte.sh
│ │ ├── jdfull_temporal8length486feature256roman.sh
│ │ ├── jdfull_temporal8length486feature256romanword.sh
│ │ ├── jdfull_temporal8length486feature256word.sh
│ │ ├── jointbinary_temporal12length512feature256.sh
│ │ ├── jointbinary_temporal12length512feature256byte.sh
│ │ ├── jointbinary_temporal12length512feature256roman.sh
│ │ ├── jointbinary_temporal12length512feature256romanword.sh
│ │ ├── jointbinary_temporal12length512feature256word.sh
│ │ ├── jointbinary_temporal8length486feature256.sh
│ │ ├── jointbinary_temporal8length486feature256byte.sh
│ │ ├── jointbinary_temporal8length486feature256roman.sh
│ │ ├── jointbinary_temporal8length486feature256romanword.sh
│ │ ├── jointbinary_temporal8length486feature256word.sh
│ │ ├── jointfull_temporal12length512feature256.sh
│ │ ├── jointfull_temporal12length512feature256byte.sh
│ │ ├── jointfull_temporal12length512feature256roman.sh
│ │ ├── jointfull_temporal12length512feature256romanword.sh
│ │ ├── jointfull_temporal12length512feature256word.sh
│ │ ├── jointfull_temporal8length486feature256.sh
│ │ ├── jointfull_temporal8length486feature256byte.sh
│ │ ├── jointfull_temporal8length486feature256roman.sh
│ │ ├── jointfull_temporal8length486feature256romanword.sh
│ │ ├── jointfull_temporal8length486feature256word.sh
│ │ ├── nytimes_temporal12length512feature256.sh
│ │ ├── nytimes_temporal12length512feature256word.sh
│ │ ├── nytimes_temporal8length486feature256.sh
│ │ ├── nytimes_temporal8length486feature256word.sh
│ │ ├── rakutenbinary_temporal12length512feature256.sh
│ │ ├── rakutenbinary_temporal12length512feature256byte.sh
│ │ ├── rakutenbinary_temporal12length512feature256roman.sh
│ │ ├── rakutenbinary_temporal12length512feature256romanword.sh
│ │ ├── rakutenbinary_temporal12length512feature256word.sh
│ │ ├── rakutenbinary_temporal8length486feature256.sh
│ │ ├── rakutenbinary_temporal8length486feature256byte.sh
│ │ ├── rakutenbinary_temporal8length486feature256roman.sh
│ │ ├── rakutenbinary_temporal8length486feature256romanword.sh
│ │ ├── rakutenbinary_temporal8length486feature256word.sh
│ │ ├── rakutenfull_temporal12length512feature256.sh
│ │ ├── rakutenfull_temporal12length512feature256byte.sh
│ │ ├── rakutenfull_temporal12length512feature256roman.sh
│ │ ├── rakutenfull_temporal12length512feature256romanword.sh
│ │ ├── rakutenfull_temporal12length512feature256word.sh
│ │ ├── rakutenfull_temporal8length486feature256.sh
│ │ ├── rakutenfull_temporal8length486feature256byte.sh
│ │ ├── rakutenfull_temporal8length486feature256roman.sh
│ │ ├── rakutenfull_temporal8length486feature256romanword.sh
│ │ └── rakutenfull_temporal8length486feature256word.sh
│ ├── config.lua
│ ├── data.lua
│ ├── driver.lua
│ ├── model.lua
│ ├── unittest/
│ │ ├── data.lua
│ │ ├── driver.lua
│ │ ├── model.lua
│ │ ├── model_cudnn.lua
│ │ ├── model_cunn.lua
│ │ ├── test.lua
│ │ ├── test_cuda.lua
│ │ ├── train.lua
│ │ └── train_cuda.lua
│ └── visualizer.lua
├── fasttext/
│ └── archive/
│ ├── 11stbinary_charbigram.sh
│ ├── 11stbinary_charbigram_evaluation.sh
│ ├── 11stbinary_charbigram_tuned.sh
│ ├── 11stbinary_charpentagram.sh
│ ├── 11stbinary_charpentagram_evaluation.sh
│ ├── 11stbinary_charpentagram_tuned.sh
│ ├── 11stbinary_charunigram.sh
│ ├── 11stbinary_charunigram_evaluation.sh
│ ├── 11stbinary_charunigram_tuned.sh
│ ├── 11stbinary_wordbigram.sh
│ ├── 11stbinary_wordbigram_evaluation.sh
│ ├── 11stbinary_wordbigram_tuned.sh
│ ├── 11stbinary_wordbigramroman.sh
│ ├── 11stbinary_wordbigramroman_evaluation.sh
│ ├── 11stbinary_wordbigramroman_tuned.sh
│ ├── 11stbinary_wordpentagram.sh
│ ├── 11stbinary_wordpentagram_evaluation.sh
│ ├── 11stbinary_wordpentagram_tuned.sh
│ ├── 11stbinary_wordpentagramroman.sh
│ ├── 11stbinary_wordpentagramroman_evaluation.sh
│ ├── 11stbinary_wordpentagramroman_tuned.sh
│ ├── 11stbinary_wordunigram.sh
│ ├── 11stbinary_wordunigram_evaluation.sh
│ ├── 11stbinary_wordunigram_tuned.sh
│ ├── 11stbinary_wordunigramroman.sh
│ ├── 11stbinary_wordunigramroman_evaluation.sh
│ ├── 11stbinary_wordunigramroman_tuned.sh
│ ├── 11stfull_charbigram.sh
│ ├── 11stfull_charbigram_evaluation.sh
│ ├── 11stfull_charbigram_tuned.sh
│ ├── 11stfull_charpentagram.sh
│ ├── 11stfull_charpentagram_evaluation.sh
│ ├── 11stfull_charpentagram_tuned.sh
│ ├── 11stfull_charunigram.sh
│ ├── 11stfull_charunigram_evaluation.sh
│ ├── 11stfull_charunigram_tuned.sh
│ ├── 11stfull_wordbigram.sh
│ ├── 11stfull_wordbigram_evaluation.sh
│ ├── 11stfull_wordbigram_tuned.sh
│ ├── 11stfull_wordbigramroman.sh
│ ├── 11stfull_wordbigramroman_evaluation.sh
│ ├── 11stfull_wordbigramroman_tuned.sh
│ ├── 11stfull_wordpentagram.sh
│ ├── 11stfull_wordpentagram_evaluation.sh
│ ├── 11stfull_wordpentagram_tuned.sh
│ ├── 11stfull_wordpentagramroman.sh
│ ├── 11stfull_wordpentagramroman_evaluation.sh
│ ├── 11stfull_wordpentagramroman_tuned.sh
│ ├── 11stfull_wordunigram.sh
│ ├── 11stfull_wordunigram_evaluation.sh
│ ├── 11stfull_wordunigram_tuned.sh
│ ├── 11stfull_wordunigramroman.sh
│ ├── 11stfull_wordunigramroman_evaluation.sh
│ ├── 11stfull_wordunigramroman_tuned.sh
│ ├── amazonbinary_charbigram.sh
│ ├── amazonbinary_charbigram_evaluation.sh
│ ├── amazonbinary_charbigram_tuned.sh
│ ├── amazonbinary_charpentagram.sh
│ ├── amazonbinary_charpentagram_evaluation.sh
│ ├── amazonbinary_charpentagram_tuned.sh
│ ├── amazonbinary_charunigram.sh
│ ├── amazonbinary_charunigram_evaluation.sh
│ ├── amazonbinary_charunigram_tuned.sh
│ ├── amazonbinary_wordbigram.sh
│ ├── amazonbinary_wordbigram_evaluation.sh
│ ├── amazonbinary_wordbigram_tuned.sh
│ ├── amazonbinary_wordpentagram.sh
│ ├── amazonbinary_wordpentagram_evaluation.sh
│ ├── amazonbinary_wordpentagram_tuned.sh
│ ├── amazonbinary_wordunigram.sh
│ ├── amazonbinary_wordunigram_evaluation.sh
│ ├── amazonbinary_wordunigram_tuned.sh
│ ├── amazonfull_charbigram.sh
│ ├── amazonfull_charbigram_evaluation.sh
│ ├── amazonfull_charbigram_tuned.sh
│ ├── amazonfull_charpentagram.sh
│ ├── amazonfull_charpentagram_evaluation.sh
│ ├── amazonfull_charpentagram_tuned.sh
│ ├── amazonfull_charunigram.sh
│ ├── amazonfull_charunigram_evaluation.sh
│ ├── amazonfull_charunigram_tuned.sh
│ ├── amazonfull_wordbigram.sh
│ ├── amazonfull_wordbigram_evaluation.sh
│ ├── amazonfull_wordbigram_tuned.sh
│ ├── amazonfull_wordpentagram.sh
│ ├── amazonfull_wordpentagram_evaluation.sh
│ ├── amazonfull_wordpentagram_tuned.sh
│ ├── amazonfull_wordunigram.sh
│ ├── amazonfull_wordunigram_evaluation.sh
│ ├── amazonfull_wordunigram_tuned.sh
│ ├── chinanews_charbigram.sh
│ ├── chinanews_charbigram_evaluation.sh
│ ├── chinanews_charbigram_tuned.sh
│ ├── chinanews_charpentagram.sh
│ ├── chinanews_charpentagram_evaluation.sh
│ ├── chinanews_charpentagram_tuned.sh
│ ├── chinanews_charunigram.sh
│ ├── chinanews_charunigram_evaluation.sh
│ ├── chinanews_charunigram_tuned.sh
│ ├── chinanews_wordbigram.sh
│ ├── chinanews_wordbigram_evaluation.sh
│ ├── chinanews_wordbigram_tuned.sh
│ ├── chinanews_wordbigramroman.sh
│ ├── chinanews_wordbigramroman_evaluation.sh
│ ├── chinanews_wordbigramroman_tuned.sh
│ ├── chinanews_wordpentagram.sh
│ ├── chinanews_wordpentagram_evaluation.sh
│ ├── chinanews_wordpentagram_tuned.sh
│ ├── chinanews_wordpentagramroman.sh
│ ├── chinanews_wordpentagramroman_evaluation.sh
│ ├── chinanews_wordpentagramroman_tuned.sh
│ ├── chinanews_wordunigram.sh
│ ├── chinanews_wordunigram_evaluation.sh
│ ├── chinanews_wordunigram_tuned.sh
│ ├── chinanews_wordunigramroman.sh
│ ├── chinanews_wordunigramroman_evaluation.sh
│ ├── chinanews_wordunigramroman_tuned.sh
│ ├── dianping_charbigram.sh
│ ├── dianping_charbigram_evaluation.sh
│ ├── dianping_charbigram_tuned.sh
│ ├── dianping_charpentagram.sh
│ ├── dianping_charpentagram_evaluation.sh
│ ├── dianping_charpentagram_tuned.sh
│ ├── dianping_charunigram.sh
│ ├── dianping_charunigram_evaluation.sh
│ ├── dianping_charunigram_tuned.sh
│ ├── dianping_wordbigram.sh
│ ├── dianping_wordbigram_evaluation.sh
│ ├── dianping_wordbigram_tuned.sh
│ ├── dianping_wordbigramroman.sh
│ ├── dianping_wordbigramroman_evaluation.sh
│ ├── dianping_wordbigramroman_tuned.sh
│ ├── dianping_wordpentagram.sh
│ ├── dianping_wordpentagram_evaluation.sh
│ ├── dianping_wordpentagram_tuned.sh
│ ├── dianping_wordpentagramroman.sh
│ ├── dianping_wordpentagramroman_evaluation.sh
│ ├── dianping_wordpentagramroman_tuned.sh
│ ├── dianping_wordunigram.sh
│ ├── dianping_wordunigram_evaluation.sh
│ ├── dianping_wordunigram_tuned.sh
│ ├── dianping_wordunigramroman.sh
│ ├── dianping_wordunigramroman_evaluation.sh
│ ├── dianping_wordunigramroman_tuned.sh
│ ├── ifeng_charbigram.sh
│ ├── ifeng_charbigram_evaluation.sh
│ ├── ifeng_charbigram_tuned.sh
│ ├── ifeng_charpentagram.sh
│ ├── ifeng_charpentagram_evaluation.sh
│ ├── ifeng_charpentagram_tuned.sh
│ ├── ifeng_charunigram.sh
│ ├── ifeng_charunigram_evaluation.sh
│ ├── ifeng_charunigram_tuned.sh
│ ├── ifeng_wordbigram.sh
│ ├── ifeng_wordbigram_evaluation.sh
│ ├── ifeng_wordbigram_tuned.sh
│ ├── ifeng_wordbigramroman.sh
│ ├── ifeng_wordbigramroman_evaluation.sh
│ ├── ifeng_wordbigramroman_tuned.sh
│ ├── ifeng_wordpentagram.sh
│ ├── ifeng_wordpentagram_evaluation.sh
│ ├── ifeng_wordpentagram_tuned.sh
│ ├── ifeng_wordpentagramroman.sh
│ ├── ifeng_wordpentagramroman_evaluation.sh
│ ├── ifeng_wordpentagramroman_tuned.sh
│ ├── ifeng_wordunigram.sh
│ ├── ifeng_wordunigram_evaluation.sh
│ ├── ifeng_wordunigram_tuned.sh
│ ├── ifeng_wordunigramroman.sh
│ ├── ifeng_wordunigramroman_evaluation.sh
│ ├── ifeng_wordunigramroman_tuned.sh
│ ├── jdbinary_charbigram.sh
│ ├── jdbinary_charbigram_evaluation.sh
│ ├── jdbinary_charbigram_tuned.sh
│ ├── jdbinary_charpentagram.sh
│ ├── jdbinary_charpentagram_evaluation.sh
│ ├── jdbinary_charpentagram_tuned.sh
│ ├── jdbinary_charunigram.sh
│ ├── jdbinary_charunigram_evaluation.sh
│ ├── jdbinary_charunigram_tuned.sh
│ ├── jdbinary_wordbigram.sh
│ ├── jdbinary_wordbigram_evaluation.sh
│ ├── jdbinary_wordbigram_tuned.sh
│ ├── jdbinary_wordbigramroman.sh
│ ├── jdbinary_wordbigramroman_evaluation.sh
│ ├── jdbinary_wordbigramroman_tuned.sh
│ ├── jdbinary_wordpentagram.sh
│ ├── jdbinary_wordpentagram_evaluation.sh
│ ├── jdbinary_wordpentagram_tuned.sh
│ ├── jdbinary_wordpentagramroman.sh
│ ├── jdbinary_wordpentagramroman_evaluation.sh
│ ├── jdbinary_wordpentagramroman_tuned.sh
│ ├── jdbinary_wordunigram.sh
│ ├── jdbinary_wordunigram_evaluation.sh
│ ├── jdbinary_wordunigram_tuned.sh
│ ├── jdbinary_wordunigramroman.sh
│ ├── jdbinary_wordunigramroman_evaluation.sh
│ ├── jdbinary_wordunigramroman_tuned.sh
│ ├── jdfull_charbigram.sh
│ ├── jdfull_charbigram_evaluation.sh
│ ├── jdfull_charbigram_tuned.sh
│ ├── jdfull_charpentagram.sh
│ ├── jdfull_charpentagram_evaluation.sh
│ ├── jdfull_charpentagram_tuned.sh
│ ├── jdfull_charunigram.sh
│ ├── jdfull_charunigram_evaluation.sh
│ ├── jdfull_charunigram_tuned.sh
│ ├── jdfull_wordbigram.sh
│ ├── jdfull_wordbigram_evaluation.sh
│ ├── jdfull_wordbigram_tuned.sh
│ ├── jdfull_wordbigramroman.sh
│ ├── jdfull_wordbigramroman_evaluation.sh
│ ├── jdfull_wordbigramroman_tuned.sh
│ ├── jdfull_wordpentagram.sh
│ ├── jdfull_wordpentagram_evaluation.sh
│ ├── jdfull_wordpentagram_tuned.sh
│ ├── jdfull_wordpentagramroman.sh
│ ├── jdfull_wordpentagramroman_evaluation.sh
│ ├── jdfull_wordpentagramroman_tuned.sh
│ ├── jdfull_wordunigram.sh
│ ├── jdfull_wordunigram_evaluation.sh
│ ├── jdfull_wordunigram_tuned.sh
│ ├── jdfull_wordunigramroman.sh
│ ├── jdfull_wordunigramroman_evaluation.sh
│ ├── jdfull_wordunigramroman_tuned.sh
│ ├── jointbinary_charbigram.sh
│ ├── jointbinary_charbigram_evaluation.sh
│ ├── jointbinary_charbigram_tuned.sh
│ ├── jointbinary_charpentagram.sh
│ ├── jointbinary_charpentagram_evaluation.sh
│ ├── jointbinary_charpentagram_tuned.sh
│ ├── jointbinary_charunigram.sh
│ ├── jointbinary_charunigram_evaluation.sh
│ ├── jointbinary_charunigram_tuned.sh
│ ├── jointbinary_wordbigram.sh
│ ├── jointbinary_wordbigram_evaluation.sh
│ ├── jointbinary_wordbigram_tuned.sh
│ ├── jointbinary_wordbigramroman.sh
│ ├── jointbinary_wordbigramroman_evaluation.sh
│ ├── jointbinary_wordbigramroman_tuned.sh
│ ├── jointbinary_wordpentagram.sh
│ ├── jointbinary_wordpentagram_evaluation.sh
│ ├── jointbinary_wordpentagram_tuned.sh
│ ├── jointbinary_wordpentagramroman.sh
│ ├── jointbinary_wordpentagramroman_evaluation.sh
│ ├── jointbinary_wordpentagramroman_tuned.sh
│ ├── jointbinary_wordunigram.sh
│ ├── jointbinary_wordunigram_evaluation.sh
│ ├── jointbinary_wordunigram_tuned.sh
│ ├── jointbinary_wordunigramroman.sh
│ ├── jointbinary_wordunigramroman_evaluation.sh
│ ├── jointbinary_wordunigramroman_tuned.sh
│ ├── jointfull_charbigram.sh
│ ├── jointfull_charbigram_evaluation.sh
│ ├── jointfull_charbigram_tuned.sh
│ ├── jointfull_charpentagram.sh
│ ├── jointfull_charpentagram_evaluation.sh
│ ├── jointfull_charpentagram_tuned.sh
│ ├── jointfull_charunigram.sh
│ ├── jointfull_charunigram_evaluation.sh
│ ├── jointfull_charunigram_tuned.sh
│ ├── jointfull_wordbigram.sh
│ ├── jointfull_wordbigram_evaluation.sh
│ ├── jointfull_wordbigram_tuned.sh
│ ├── jointfull_wordbigramroman.sh
│ ├── jointfull_wordbigramroman_evaluation.sh
│ ├── jointfull_wordbigramroman_tuned.sh
│ ├── jointfull_wordpentagram.sh
│ ├── jointfull_wordpentagram_evaluation.sh
│ ├── jointfull_wordpentagram_tuned.sh
│ ├── jointfull_wordpentagramroman.sh
│ ├── jointfull_wordpentagramroman_evaluation.sh
│ ├── jointfull_wordpentagramroman_tuned.sh
│ ├── jointfull_wordunigram.sh
│ ├── jointfull_wordunigram_evaluation.sh
│ ├── jointfull_wordunigram_tuned.sh
│ ├── jointfull_wordunigramroman.sh
│ ├── jointfull_wordunigramroman_evaluation.sh
│ ├── jointfull_wordunigramroman_tuned.sh
│ ├── nytimes_charbigram.sh
│ ├── nytimes_charbigram_evaluation.sh
│ ├── nytimes_charbigram_tuned.sh
│ ├── nytimes_charpentagram.sh
│ ├── nytimes_charpentagram_evaluation.sh
│ ├── nytimes_charpentagram_tuned.sh
│ ├── nytimes_charunigram.sh
│ ├── nytimes_charunigram_evaluation.sh
│ ├── nytimes_charunigram_tuned.sh
│ ├── nytimes_wordbigram.sh
│ ├── nytimes_wordbigram_evaluation.sh
│ ├── nytimes_wordbigram_tuned.sh
│ ├── nytimes_wordpentagram.sh
│ ├── nytimes_wordpentagram_evaluation.sh
│ ├── nytimes_wordpentagram_tuned.sh
│ ├── nytimes_wordunigram.sh
│ ├── nytimes_wordunigram_evaluation.sh
│ ├── nytimes_wordunigram_tuned.sh
│ ├── rakutenbinary_charbigram.sh
│ ├── rakutenbinary_charbigram_evaluation.sh
│ ├── rakutenbinary_charbigram_tuned.sh
│ ├── rakutenbinary_charpentagram.sh
│ ├── rakutenbinary_charpentagram_evaluation.sh
│ ├── rakutenbinary_charpentagram_tuned.sh
│ ├── rakutenbinary_charunigram.sh
│ ├── rakutenbinary_charunigram_evaluation.sh
│ ├── rakutenbinary_charunigram_tuned.sh
│ ├── rakutenbinary_wordbigram.sh
│ ├── rakutenbinary_wordbigram_evaluation.sh
│ ├── rakutenbinary_wordbigram_tuned.sh
│ ├── rakutenbinary_wordbigramroman.sh
│ ├── rakutenbinary_wordbigramroman_evaluation.sh
│ ├── rakutenbinary_wordbigramroman_tuned.sh
│ ├── rakutenbinary_wordpentagram.sh
│ ├── rakutenbinary_wordpentagram_evaluation.sh
│ ├── rakutenbinary_wordpentagram_tuned.sh
│ ├── rakutenbinary_wordpentagramroman.sh
│ ├── rakutenbinary_wordpentagramroman_evaluation.sh
│ ├── rakutenbinary_wordpentagramroman_tuned.sh
│ ├── rakutenbinary_wordunigram.sh
│ ├── rakutenbinary_wordunigram_evaluation.sh
│ ├── rakutenbinary_wordunigram_tuned.sh
│ ├── rakutenbinary_wordunigramroman.sh
│ ├── rakutenbinary_wordunigramroman_evaluation.sh
│ ├── rakutenbinary_wordunigramroman_tuned.sh
│ ├── rakutenfull_charbigram.sh
│ ├── rakutenfull_charbigram_evaluation.sh
│ ├── rakutenfull_charbigram_tuned.sh
│ ├── rakutenfull_charpentagram.sh
│ ├── rakutenfull_charpentagram_evaluation.sh
│ ├── rakutenfull_charpentagram_tuned.sh
│ ├── rakutenfull_charunigram.sh
│ ├── rakutenfull_charunigram_evaluation.sh
│ ├── rakutenfull_charunigram_tuned.sh
│ ├── rakutenfull_wordbigram.sh
│ ├── rakutenfull_wordbigram_evaluation.sh
│ ├── rakutenfull_wordbigram_tuned.sh
│ ├── rakutenfull_wordbigramroman.sh
│ ├── rakutenfull_wordbigramroman_evaluation.sh
│ ├── rakutenfull_wordbigramroman_tuned.sh
│ ├── rakutenfull_wordpentagram.sh
│ ├── rakutenfull_wordpentagram_evaluation.sh
│ ├── rakutenfull_wordpentagram_tuned.sh
│ ├── rakutenfull_wordpentagramroman.sh
│ ├── rakutenfull_wordpentagramroman_evaluation.sh
│ ├── rakutenfull_wordpentagramroman_tuned.sh
│ ├── rakutenfull_wordunigram.sh
│ ├── rakutenfull_wordunigram_evaluation.sh
│ ├── rakutenfull_wordunigram_tuned.sh
│ ├── rakutenfull_wordunigramroman.sh
│ ├── rakutenfull_wordunigramroman_evaluation.sh
│ └── rakutenfull_wordunigramroman_tuned.sh
├── glyphnet/
│ ├── archive/
│ │ ├── 11stbinary_spatial6temporal8length486feature256.sh
│ │ ├── 11stbinary_spatial8temporal12length512feature256.sh
│ │ ├── 11stfull_spatial6temporal8length486feature256.sh
│ │ ├── 11stfull_spatial8temporal12length512feature256.sh
│ │ ├── amazonbinary_spatial6temporal8length486feature256.sh
│ │ ├── amazonbinary_spatial8temporal12length512feature256.sh
│ │ ├── amazonfull_spatial6temporal8length486feature256.sh
│ │ ├── amazonfull_spatial8temporal12length512feature256.sh
│ │ ├── chinanews_spatial6temporal8length486feature256.sh
│ │ ├── chinanews_spatial8temporal12length512feature256.sh
│ │ ├── dianping_spatial6temporal8length486feature256.sh
│ │ ├── dianping_spatial8temporal12length512feature256.sh
│ │ ├── ifeng_spatial6temporal8length486feature256.sh
│ │ ├── ifeng_spatial8temporal12length512feature256.sh
│ │ ├── jdbinary_spatial6temporal8length486feature256.sh
│ │ ├── jdbinary_spatial8temporal12length512feature256.sh
│ │ ├── jdfull_spatial6temporal8length486feature256.sh
│ │ ├── jdfull_spatial8temporal12length512feature256.sh
│ │ ├── jointbinary_spatial6temporal8length486feature256.sh
│ │ ├── jointbinary_spatial8temporal12length512feature256.sh
│ │ ├── jointfull_spatial6temporal8length486feature256.sh
│ │ ├── jointfull_spatial8temporal12length512feature256.sh
│ │ ├── nytimes_spatial6temporal8length486feature256.sh
│ │ ├── nytimes_spatial8temporal12length512feature256.sh
│ │ ├── rakutenbinary_spatial6temporal8length486feature256.sh
│ │ ├── rakutenbinary_spatial8temporal12length512feature256.sh
│ │ ├── rakutenfull_spatial6temporal8length486feature256.sh
│ │ └── rakutenfull_spatial8temporal12length512feature256.sh
│ ├── config.lua
│ ├── data.lua
│ ├── driver.lua
│ ├── main.lua
│ ├── model.lua
│ ├── modules/
│ │ ├── TemporalConvolutionCudnn.lua
│ │ ├── TemporalConvolutionMM.lua
│ │ ├── TemporalMaxPoolingCudnn.lua
│ │ └── TemporalMaxPoolingMM.lua
│ ├── modules.lua
│ ├── scroll.lua
│ ├── scroll.ui
│ ├── test.lua
│ ├── train.lua
│ ├── unittest/
│ │ ├── data.lua
│ │ ├── driver.lua
│ │ ├── model.lua
│ │ ├── model_cuda.lua
│ │ ├── model_cudnn.lua
│ │ ├── modules_temporal.lua
│ │ ├── modules_temporal_cudnn.lua
│ │ ├── test.lua
│ │ ├── test_cuda.lua
│ │ ├── train.lua
│ │ └── train_cuda.lua
│ └── visualizer.lua
├── linearnet/
│ ├── archive/
│ │ ├── 11stbinary_charbag.sh
│ │ ├── 11stbinary_charbagtfidf.sh
│ │ ├── 11stbinary_chargram.sh
│ │ ├── 11stbinary_chargramtfidf.sh
│ │ ├── 11stbinary_wordbag.sh
│ │ ├── 11stbinary_wordbagroman.sh
│ │ ├── 11stbinary_wordbagtfidf.sh
│ │ ├── 11stbinary_wordbagtfidfroman.sh
│ │ ├── 11stbinary_wordgram.sh
│ │ ├── 11stbinary_wordgramroman.sh
│ │ ├── 11stbinary_wordgramtfidf.sh
│ │ ├── 11stbinary_wordgramtfidfroman.sh
│ │ ├── 11stfull_charbag.sh
│ │ ├── 11stfull_charbagtfidf.sh
│ │ ├── 11stfull_chargram.sh
│ │ ├── 11stfull_chargramtfidf.sh
│ │ ├── 11stfull_wordbag.sh
│ │ ├── 11stfull_wordbagroman.sh
│ │ ├── 11stfull_wordbagtfidf.sh
│ │ ├── 11stfull_wordbagtfidfroman.sh
│ │ ├── 11stfull_wordgram.sh
│ │ ├── 11stfull_wordgramroman.sh
│ │ ├── 11stfull_wordgramtfidf.sh
│ │ ├── 11stfull_wordgramtfidfroman.sh
│ │ ├── amazonbinary_charbag.sh
│ │ ├── amazonbinary_charbagtfidf.sh
│ │ ├── amazonbinary_chargram.sh
│ │ ├── amazonbinary_chargramtfidf.sh
│ │ ├── amazonbinary_wordbag.sh
│ │ ├── amazonbinary_wordbagtfidf.sh
│ │ ├── amazonbinary_wordgram.sh
│ │ ├── amazonbinary_wordgramtfidf.sh
│ │ ├── amazonfull_charbag.sh
│ │ ├── amazonfull_charbagtfidf.sh
│ │ ├── amazonfull_chargram.sh
│ │ ├── amazonfull_chargramtfidf.sh
│ │ ├── amazonfull_wordbag.sh
│ │ ├── amazonfull_wordbagtfidf.sh
│ │ ├── amazonfull_wordgram.sh
│ │ ├── amazonfull_wordgramtfidf.sh
│ │ ├── chinanews_charbag.sh
│ │ ├── chinanews_charbagtfidf.sh
│ │ ├── chinanews_chargram.sh
│ │ ├── chinanews_chargramtfidf.sh
│ │ ├── chinanews_wordbag.sh
│ │ ├── chinanews_wordbagroman.sh
│ │ ├── chinanews_wordbagtfidf.sh
│ │ ├── chinanews_wordbagtfidfroman.sh
│ │ ├── chinanews_wordgram.sh
│ │ ├── chinanews_wordgramroman.sh
│ │ ├── chinanews_wordgramtfidf.sh
│ │ ├── chinanews_wordgramtfidfroman.sh
│ │ ├── dianping_charbag.sh
│ │ ├── dianping_charbagtfidf.sh
│ │ ├── dianping_chargram.sh
│ │ ├── dianping_chargramtfidf.sh
│ │ ├── dianping_wordbag.sh
│ │ ├── dianping_wordbagroman.sh
│ │ ├── dianping_wordbagtfidf.sh
│ │ ├── dianping_wordbagtfidfroman.sh
│ │ ├── dianping_wordgram.sh
│ │ ├── dianping_wordgramroman.sh
│ │ ├── dianping_wordgramtfidf.sh
│ │ ├── dianping_wordgramtfidfroman.sh
│ │ ├── ifeng_charbag.sh
│ │ ├── ifeng_charbagtfidf.sh
│ │ ├── ifeng_chargram.sh
│ │ ├── ifeng_chargramtfidf.sh
│ │ ├── ifeng_wordbag.sh
│ │ ├── ifeng_wordbagroman.sh
│ │ ├── ifeng_wordbagtfidf.sh
│ │ ├── ifeng_wordbagtfidfroman.sh
│ │ ├── ifeng_wordgram.sh
│ │ ├── ifeng_wordgramroman.sh
│ │ ├── ifeng_wordgramtfidf.sh
│ │ ├── ifeng_wordgramtfidfroman.sh
│ │ ├── jdbinary_charbag.sh
│ │ ├── jdbinary_charbagtfidf.sh
│ │ ├── jdbinary_chargram.sh
│ │ ├── jdbinary_chargramtfidf.sh
│ │ ├── jdbinary_wordbag.sh
│ │ ├── jdbinary_wordbagroman.sh
│ │ ├── jdbinary_wordbagtfidf.sh
│ │ ├── jdbinary_wordbagtfidfroman.sh
│ │ ├── jdbinary_wordgram.sh
│ │ ├── jdbinary_wordgramroman.sh
│ │ ├── jdbinary_wordgramtfidf.sh
│ │ ├── jdbinary_wordgramtfidfroman.sh
│ │ ├── jdfull_charbag.sh
│ │ ├── jdfull_charbagtfidf.sh
│ │ ├── jdfull_chargram.sh
│ │ ├── jdfull_chargramtfidf.sh
│ │ ├── jdfull_wordbag.sh
│ │ ├── jdfull_wordbagroman.sh
│ │ ├── jdfull_wordbagtfidf.sh
│ │ ├── jdfull_wordbagtfidfroman.sh
│ │ ├── jdfull_wordgram.sh
│ │ ├── jdfull_wordgramroman.sh
│ │ ├── jdfull_wordgramtfidf.sh
│ │ ├── jdfull_wordgramtfidfroman.sh
│ │ ├── jointbinary_charbag.sh
│ │ ├── jointbinary_charbagtfidf.sh
│ │ ├── jointbinary_chargram.sh
│ │ ├── jointbinary_chargramtfidf.sh
│ │ ├── jointbinary_wordbag.sh
│ │ ├── jointbinary_wordbagroman.sh
│ │ ├── jointbinary_wordbagtfidf.sh
│ │ ├── jointbinary_wordbagtfidfroman.sh
│ │ ├── jointbinary_wordgram.sh
│ │ ├── jointbinary_wordgramroman.sh
│ │ ├── jointbinary_wordgramtfidf.sh
│ │ ├── jointbinary_wordgramtfidfroman.sh
│ │ ├── jointfull_charbag.sh
│ │ ├── jointfull_charbagtfidf.sh
│ │ ├── jointfull_chargram.sh
│ │ ├── jointfull_chargramtfidf.sh
│ │ ├── jointfull_wordbag.sh
│ │ ├── jointfull_wordbagroman.sh
│ │ ├── jointfull_wordbagtfidf.sh
│ │ ├── jointfull_wordbagtfidfroman.sh
│ │ ├── jointfull_wordgram.sh
│ │ ├── jointfull_wordgramroman.sh
│ │ ├── jointfull_wordgramtfidf.sh
│ │ ├── jointfull_wordgramtfidfroman.sh
│ │ ├── nytimes_charbag.sh
│ │ ├── nytimes_charbagtfidf.sh
│ │ ├── nytimes_chargram.sh
│ │ ├── nytimes_chargramtfidf.sh
│ │ ├── nytimes_wordbag.sh
│ │ ├── nytimes_wordbagtfidf.sh
│ │ ├── nytimes_wordgram.sh
│ │ ├── nytimes_wordgramtfidf.sh
│ │ ├── rakutenbinary_charbag.sh
│ │ ├── rakutenbinary_charbagtfidf.sh
│ │ ├── rakutenbinary_chargram.sh
│ │ ├── rakutenbinary_chargramtfidf.sh
│ │ ├── rakutenbinary_wordbag.sh
│ │ ├── rakutenbinary_wordbagroman.sh
│ │ ├── rakutenbinary_wordbagtfidf.sh
│ │ ├── rakutenbinary_wordbagtfidfroman.sh
│ │ ├── rakutenbinary_wordgram.sh
│ │ ├── rakutenbinary_wordgramroman.sh
│ │ ├── rakutenbinary_wordgramtfidf.sh
│ │ ├── rakutenbinary_wordgramtfidfroman.sh
│ │ ├── rakutenfull_charbag.sh
│ │ ├── rakutenfull_charbagtfidf.sh
│ │ ├── rakutenfull_chargram.sh
│ │ ├── rakutenfull_chargramtfidf.sh
│ │ ├── rakutenfull_wordbag.sh
│ │ ├── rakutenfull_wordbagroman.sh
│ │ ├── rakutenfull_wordbagtfidf.sh
│ │ ├── rakutenfull_wordbagtfidfroman.sh
│ │ ├── rakutenfull_wordgram.sh
│ │ ├── rakutenfull_wordgramroman.sh
│ │ ├── rakutenfull_wordgramtfidf.sh
│ │ └── rakutenfull_wordgramtfidfroman.sh
│ ├── config.lua
│ ├── data.lua
│ ├── driver.lua
│ ├── model.lua
│ ├── queue.lua
│ ├── test.lua
│ ├── train.lua
│ └── unittest/
│ ├── data.lua
│ ├── driver.lua
│ ├── model.lua
│ ├── test.lua
│ └── train.lua
├── models/
│ ├── README.txt
│ ├── embednet/
│ │ └── README.txt
│ ├── fasttext/
│ │ └── README.txt
│ ├── glyphnet/
│ │ └── README.txt
│ ├── linearnet/
│ │ └── README.txt
│ └── onehotnet/
│ └── README.txt
├── onehotnet/
│ ├── archive/
│ │ ├── 11stbinary_onehot4temporal12length2048feature256.sh
│ │ ├── 11stbinary_onehot4temporal12length2048feature256roman.sh
│ │ ├── 11stbinary_onehot4temporal8length1944feature256.sh
│ │ ├── 11stbinary_onehot4temporal8length1944feature256roman.sh
│ │ ├── 11stfull_onehot4temporal12length2048feature256.sh
│ │ ├── 11stfull_onehot4temporal12length2048feature256roman.sh
│ │ ├── 11stfull_onehot4temporal8length1944feature256.sh
│ │ ├── 11stfull_onehot4temporal8length1944feature256roman.sh
│ │ ├── amazonbinary_onehot4temporal12length2048feature256.sh
│ │ ├── amazonbinary_onehot4temporal8length1944feature256.sh
│ │ ├── amazonfull_onehot4temporal12length2048feature256.sh
│ │ ├── amazonfull_onehot4temporal8length1944feature256.sh
│ │ ├── chinanews_onehot4temporal12length2048feature256.sh
│ │ ├── chinanews_onehot4temporal12length2048feature256roman.sh
│ │ ├── chinanews_onehot4temporal8length1944feature256.sh
│ │ ├── chinanews_onehot4temporal8length1944feature256roman.sh
│ │ ├── dianping_onehot4temporal12length2048feature256.sh
│ │ ├── dianping_onehot4temporal12length2048feature256roman.sh
│ │ ├── dianping_onehot4temporal8length1944feature256.sh
│ │ ├── dianping_onehot4temporal8length1944feature256roman.sh
│ │ ├── ifeng_onehot4temporal12length2048feature256.sh
│ │ ├── ifeng_onehot4temporal12length2048feature256roman.sh
│ │ ├── ifeng_onehot4temporal8length1944feature256.sh
│ │ ├── ifeng_onehot4temporal8length1944feature256roman.sh
│ │ ├── jdbinary_onehot4temporal12length2048feature256.sh
│ │ ├── jdbinary_onehot4temporal12length2048feature256roman.sh
│ │ ├── jdbinary_onehot4temporal8length1944feature256.sh
│ │ ├── jdbinary_onehot4temporal8length1944feature256roman.sh
│ │ ├── jdfull_onehot4temporal12length2048feature256.sh
│ │ ├── jdfull_onehot4temporal12length2048feature256roman.sh
│ │ ├── jdfull_onehot4temporal8length1944feature256.sh
│ │ ├── jdfull_onehot4temporal8length1944feature256roman.sh
│ │ ├── jointbinary_onehot4temporal12length2048feature256.sh
│ │ ├── jointbinary_onehot4temporal12length2048feature256roman.sh
│ │ ├── jointbinary_onehot4temporal8length1944feature256.sh
│ │ ├── jointbinary_onehot4temporal8length1944feature256roman.sh
│ │ ├── jointfull_onehot4temporal12length2048feature256.sh
│ │ ├── jointfull_onehot4temporal12length2048feature256roman.sh
│ │ ├── jointfull_onehot4temporal8length1944feature256.sh
│ │ ├── jointfull_onehot4temporal8length1944feature256roman.sh
│ │ ├── nytimes_onehot4temporal12length2048feature256.sh
│ │ ├── nytimes_onehot4temporal8length1944feature256.sh
│ │ ├── rakutenbinary_onehot4temporal12length2048feature256.sh
│ │ ├── rakutenbinary_onehot4temporal12length2048feature256roman.sh
│ │ ├── rakutenbinary_onehot4temporal8length1944feature256.sh
│ │ ├── rakutenbinary_onehot4temporal8length1944feature256roman.sh
│ │ ├── rakutenfull_onehot4temporal12length2048feature256.sh
│ │ ├── rakutenfull_onehot4temporal12length2048feature256roman.sh
│ │ ├── rakutenfull_onehot4temporal8length1944feature256.sh
│ │ └── rakutenfull_onehot4temporal8length1944feature256roman.sh
│ ├── config.lua
│ ├── data.lua
│ ├── driver.lua
│ ├── model.lua
│ └── unittest/
│ ├── data.lua
│ ├── driver.lua
│ ├── model.lua
│ ├── model_cuda.lua
│ ├── model_cudnn.lua
│ ├── test.lua
│ ├── test_cuda.lua
│ ├── train.lua
│ └── train_cuda.lua
└── unifont/
├── createunifont.lua
├── unifont/
│ └── README.txt
└── visualize.lua
SYMBOL INDEX (39 symbols across 14 files) FILE: data/11st/construct_rr.py function main (line 25) | def main(): function romanizeText (line 43) | def romanizeText(transliter, text): function convertRoman (line 51) | def convertRoman(transliter): FILE: data/11st/create_post.py function main (line 19) | def main(): function createData (line 36) | def createData(): FILE: data/11st/create_review.py function main (line 19) | def main(): function createData (line 36) | def createData(): FILE: data/11st/segment_word.py function main (line 27) | def main(): function readWords (line 59) | def readWords(): function segmentWords (line 76) | def segmentWords(): function sortWords (line 106) | def sortWords(word_count, word_freq): function convertWords (line 130) | def convertWords(word_index): FILE: data/chinanews/construct_topic.py function main (line 20) | def main(): function createData (line 41) | def createData(): FILE: data/dianping/construct_pinyin.py function main (line 21) | def main(): function convertPinyin (line 38) | def convertPinyin(): FILE: data/dianping/remove_duplication.py function main (line 25) | def main(): function removeDuplicate (line 42) | def removeDuplicate(): FILE: data/dianping/segment_word.py function main (line 24) | def main(): function readWords (line 56) | def readWords(): function segmentWords (line 73) | def segmentWords(): function sortWords (line 102) | def sortWords(word_count, word_freq): function convertWords (line 126) | def convertWords(word_index): FILE: data/ifeng/construct_topic.py function main (line 27) | def main(): function createData (line 44) | def createData(): FILE: data/jd/create_comment.py function main (line 19) | def main(): function createData (line 36) | def createData(): FILE: data/nytimes/construct_topic.py function main (line 22) | def main(): function createData (line 43) | def createData(): FILE: data/rakuten/construct_hepburn.py function main (line 22) | def main(): function romanizeText (line 40) | def 
romanizeText(mecab, text): function convertRoman (line 55) | def convertRoman(mecab): FILE: data/rakuten/create_review.py function main (line 19) | def main(): function createData (line 36) | def createData(): FILE: data/rakuten/segment_word.py function main (line 24) | def main(): function readWords (line 56) | def readWords(): function segmentWords (line 73) | def segmentWords(): function sortWords (line 109) | def sortWords(word_count, word_freq): function convertWords (line 133) | def convertWords(word_index):
Condensed preview — 833 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (899K chars).
[
{
"path": "LICENSE",
"chars": 1511,
"preview": "BSD 3-Clause License\n\nCopyright (c) 2017, Xiang Zhang\nAll rights reserved.\n\nRedistribution and use in source and binary "
},
{
"path": "README.md",
"chars": 3992,
"preview": "# Glyph\n\nThis repository is used to publish all the code used for the following article:\n\n[Xiang Zhang, Yann LeCun, Whic"
},
{
"path": "data/11st/construct_rr.py",
"chars": 1968,
"preview": "#!/usr/bin/python3\n\n'''\nConvert Korean datasets to Revised Romanization of Korean (RR, MC2000)\nCopyright 2016 Xiang Zhan"
},
{
"path": "data/11st/create_post.py",
"chars": 1815,
"preview": "#!/usr/bin/python3\n\n'''\nCreate data from list of LZMA compressed archives of reviews\nCopyright 2016 Xiang Zhang\n\nUsage: "
},
{
"path": "data/11st/create_review.py",
"chars": 1821,
"preview": "#!/usr/bin/python3\n\n'''\nCreate data from list of LZMA compressed archives of reviews\nCopyright 2016 Xiang Zhang\n\nUsage: "
},
{
"path": "data/11st/segment_rr_word.lua",
"chars": 5898,
"preview": "--[[\nCreate romananized word data from romanized data in csv for Korean\nCopyright 2016 Xiang Zhang\n\nUsage: th segment_rr"
},
{
"path": "data/11st/segment_word.py",
"chars": 4964,
"preview": "#!/usr/bin/python3\n\n'''\nConvert Korean datasets to Index of Words\nCopyright 2016 Xiang Zhang\n\nUsage: python3 construct_p"
},
{
"path": "data/README.md",
"chars": 174,
"preview": "# Datasets\n\nThis directory contains the preprocessing scripts for all the datasets used in the paper. These datasets are"
},
{
"path": "data/chinanews/construct_topic.py",
"chars": 2463,
"preview": "#!/usr/bin/python3\n\n'''\nCreate data from list of LZMA compressed archives of news articles\nCopyright 2016 Xiang Zhang\n\nU"
},
{
"path": "data/data/README.txt",
"chars": 61,
"preview": "This directory should contain training and testing datasets.\n"
},
{
"path": "data/dianping/combine_gram_count.lua",
"chars": 3298,
"preview": "--[[\nCombine sorted gram counts\nCopyright 2016 Xiang Zhang\n\nUsage: th combine_gram_count.lua [input_prefix] [output] [sa"
},
{
"path": "data/dianping/construct_charbag.lua",
"chars": 7228,
"preview": "--[[\nConstruct unicode character bag-of-element format from unicode serialization\nCopyright 2016 Xiang Zhang\n\nUsage: th "
},
{
"path": "data/dianping/construct_chargram.lua",
"chars": 10541,
"preview": "--[[\nConstruct unicode character ngrams format from unicode serialization\nCopyright 2016 Xiang Zhang\n\nUsage: th construc"
},
{
"path": "data/dianping/construct_chartoken.lua",
"chars": 4189,
"preview": "--[[\nCreate chartoken format for fastText\nCopyright 2017 Xiang Zhang\n\nUsage: th construct_chartoken.lua [input] [output]"
},
{
"path": "data/dianping/construct_code.lua",
"chars": 3791,
"preview": "--[[\nConstruct unicode serialization format from string serialization format\nCopyright 2015-2016 Xiang Zhang\n\nUsage: th "
},
{
"path": "data/dianping/construct_pinyin.py",
"chars": 1689,
"preview": "#!/usr/bin/python3\n\n'''\nConvert Chinese datasets to Pinyin format\nCopyright 2016 Xiang Zhang\n\nUsage: python3 construct_p"
},
{
"path": "data/dianping/construct_reviews.lua",
"chars": 1414,
"preview": "--[[\nCreate reviews in csv format from original txt file\nCopyright 2015-2016 Xiang Zhang\n\nUsage: th construct_reviews [i"
},
{
"path": "data/dianping/construct_string.lua",
"chars": 4178,
"preview": "--[[\nCreate string serialization format from csv files\nCopyright 2015-2016 Xiang Zhang\n\nUsage: th construct_string.lua ["
},
{
"path": "data/dianping/construct_tfidf.lua",
"chars": 3928,
"preview": "--[[\nConstruct tfidf format from bag format\nCopyright 2016 Xiang Zhang\n\nUsage: th construct_tfidf.lua [input] [output] ["
},
{
"path": "data/dianping/construct_word.lua",
"chars": 4303,
"preview": "--[[\nCreate word serialization format from csv files\nCopyright 2015-2016 Xiang Zhang\n\nUsage: th construct_word.lua [inpu"
},
{
"path": "data/dianping/construct_wordbag.lua",
"chars": 3998,
"preview": "--[[\nConstruct word bag-of-element format\nCopyright 2016 Xiang Zhang\n\nUsage: th construct_wordbag.lua [input] [output] ["
},
{
"path": "data/dianping/construct_wordgram.lua",
"chars": 7733,
"preview": "--[[\nConstructngrams format from serialization\nCopyright 2016 Xiang Zhang\n\nUsage: th construct_wordgram.lua [input] [out"
},
{
"path": "data/dianping/construct_wordtoken.lua",
"chars": 3392,
"preview": "--[[\nConstruct word token format from csv files\nCopyright 2017 Xiang Zhang\n\nUsage: th construct_wordtoken [input] [list]"
},
{
"path": "data/dianping/convert_string_code.lua",
"chars": 772,
"preview": "--[[\nConvert string serialization to code\nCopyright 2016 Xiang Zhang\n\nUsage: th convert_string_code.lua [input] [output]"
},
{
"path": "data/dianping/count_chargram.lua",
"chars": 8897,
"preview": "--[[\nParallelized chargram counting program\nCopyright Xiang Zhang 2016\n\nUsage: th count_chargram.lua [input] [output_pre"
},
{
"path": "data/dianping/count_wordgram.lua",
"chars": 10770,
"preview": "--[[\nParallelized wordgram counting program\nCopyright Xiang Zhang 2016\n\nUsage: th count_wordgram.lua [input] [output_pre"
},
{
"path": "data/dianping/limit_code.lua",
"chars": 929,
"preview": "--[[\nLimit the maximum code value\nCopyright 2016 Xiang Zhang\n\nUsage: th limit_code.lua [input] [output] [limit]\n--]]\n\nlo"
},
{
"path": "data/dianping/limit_csvlines.sh",
"chars": 198,
"preview": "#!/bin/bash\n\n# Limit csv files to designated number of lines\n# Copyright 2015 Xiang Zhang\n#\n# Usage: bash limit_csvlines"
},
{
"path": "data/dianping/queue.lua",
"chars": 3487,
"preview": "--[[\nMultithreaded queue based on tds\nCopyright 2015 Xiang Zhang\n--]]\n\nlocal class = require('pl.class')\nlocal ffi = req"
},
{
"path": "data/dianping/remove_duplication.py",
"chars": 1654,
"preview": "#!/usr/bin/python3\n\n'''\nRemove duplication from csv format file\nCopyright 2015 Xiang Zhang\n\nUsage: python3 remove_duplic"
},
{
"path": "data/dianping/remove_null.sh",
"chars": 166,
"preview": "#!/bin/bash\n\n# Remove NULL character from file\n# Copyright 2015 Xiang Zhang\n#\n# Usage: bash remove_null.sh [input] [outp"
},
{
"path": "data/dianping/segment_roman_word.lua",
"chars": 5681,
"preview": "--[[\nCreate romananized word data from romanized data in csv\nCopyright 2016 Xiang Zhang\n\nUsage: th segment_roman_word.lu"
},
{
"path": "data/dianping/segment_word.py",
"chars": 4736,
"preview": "#!/usr/bin/python3\n\n'''\nConvert Chinese datasets to Index of Words\nCopyright 2016 Xiang Zhang\n\nUsage: python3 segment_wo"
},
{
"path": "data/dianping/select_data.lua",
"chars": 3162,
"preview": "--[[\nSelect data from non-duplicate datasets\nCopyright 2015 Xiang Zhang\n\nUsage: th select_data.lua [count] [input] [outp"
},
{
"path": "data/dianping/shuffle_lines.sh",
"chars": 155,
"preview": "#!/bin/bash\n\n# Shuffle lines in a text file\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash shuffle_lines.sh [input] [outpu"
},
{
"path": "data/dianping/sort_gram_count.sh",
"chars": 295,
"preview": "#!/bin/bash\n\n# Sort distributed grams file\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash sort_gram_count.sh [input_direct"
},
{
"path": "data/dianping/sort_gram_list.sh",
"chars": 250,
"preview": "#!/bin/bash\n\n# Sort list of grams and cut the count\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash sort_gram_list.sh [inpu"
},
{
"path": "data/dianping/split_lines.sh",
"chars": 256,
"preview": "#!/bin/bash\n\n# Split lines in a text file\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash split_lines.sh [lines] [input] [o"
},
{
"path": "data/dianping/split_train.lua",
"chars": 3195,
"preview": "--[[\nSplit data into training and testing subsets\nCopyright 2015 Xiang Zhang\n\nUsage: th split_train [count] [input] [tra"
},
{
"path": "data/ifeng/construct_topic.py",
"chars": 2219,
"preview": "#!/usr/bin/python3\n\n'''\nCreate data from list of LZMA compressed archives of news articles\nCopyright 2016 Xiang Zhang\n\nU"
},
{
"path": "data/jd/count_data.lua",
"chars": 3456,
"preview": "--[[\nCount data for each class and length\nCopyright 2016 Xiang Zhang\n\nUsage: th count_data.lua [input] [output]\n--]]\n\nlo"
},
{
"path": "data/jd/create_comment.py",
"chars": 1860,
"preview": "#!/usr/bin/python3\n\n'''\nCreate data from list of LZMA compressed archives of comments\nCopyright 2016 Xiang Zhang\n\nUsage:"
},
{
"path": "data/jd/limit_length.lua",
"chars": 2576,
"preview": "--[[\nLimit length for data\nCopyright 2016 Xiang Zhang\n\nUsage: th limit_length.lua [input] [output] [min] [max]\n--]]\n\n-- "
},
{
"path": "data/jd/sort_data.sh",
"chars": 255,
"preview": "#!/bin/bash\n\n# Sort comma-separated file starting from the second field\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash sor"
},
{
"path": "data/joint/combine_word.lua",
"chars": 4066,
"preview": "--[[\nCombine two word data together\nCopyright 2016 Xiang Zhang\n\nUsage: th combine_word_list.lua [input_1] [list_1] [inpu"
},
{
"path": "data/joint/combine_word_list.lua",
"chars": 4420,
"preview": "--[[\nCombine two word data together\nCopyright 2016 Xiang Zhang\n\nUsage: th combine_word_list.lua [list_1] [size_1] [list_"
},
{
"path": "data/nytimes/construct_topic.py",
"chars": 3053,
"preview": "#!/usr/bin/python3\n\n'''\nCreate data from list of LZMA compressed archives of news articles\nCopyright 2016 Xiang Zhang\n\nU"
},
{
"path": "data/nytimes/count_class.lua",
"chars": 2474,
"preview": "--[[\nCount data for each class and length\nCopyright 2016 Xiang Zhang\n\nUsage: th count_data.lua [input] [output]\n--]]\n\nlo"
},
{
"path": "data/rakuten/construct_hepburn.py",
"chars": 2178,
"preview": "#!/usr/bin/python3\n\n'''\nConvert Japanese datasets to Hepburn Romanization\nCopyright 2016 Xiang Zhang\n\nUsage: python3 con"
},
{
"path": "data/rakuten/create_review.py",
"chars": 1825,
"preview": "#!/usr/bin/python3\n\n'''\nCreate data from list of LZMA compressed archives of reviews\nCopyright 2016 Xiang Zhang\n\nUsage: "
},
{
"path": "data/rakuten/segment_word.py",
"chars": 5385,
"preview": "#!/usr/bin/python3\n\n'''\nConvert Japanese datasets to Index of Words\nCopyright 2016 Xiang Zhang\n\nUsage: python3 construct"
},
{
"path": "doc/dianping.md",
"chars": 37342,
"preview": "# Dianping\n\nThis documentation contains information on how to reproduce all the results for the `Dianping` datasets in t"
},
{
"path": "embednet/archive/11stbinary_temporal12length512feature256.sh",
"chars": 357,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stbinary_temporal12length512feature256byte.sh",
"chars": 469,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stbinary_temporal12length512feature256roman.sh",
"chars": 476,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stbinary_temporal12length512feature256romanword.sh",
"chars": 462,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stbinary_temporal12length512feature256word.sh",
"chars": 451,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stbinary_temporal8length486feature256.sh",
"chars": 380,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stbinary_temporal8length486feature256byte.sh",
"chars": 492,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stbinary_temporal8length486feature256roman.sh",
"chars": 499,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stbinary_temporal8length486feature256romanword.sh",
"chars": 485,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stbinary_temporal8length486feature256word.sh",
"chars": 474,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stfull_temporal12length512feature256.sh",
"chars": 351,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stfull_temporal12length512feature256byte.sh",
"chars": 463,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stfull_temporal12length512feature256roman.sh",
"chars": 470,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stfull_temporal12length512feature256romanword.sh",
"chars": 456,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stfull_temporal12length512feature256word.sh",
"chars": 445,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stfull_temporal8length486feature256.sh",
"chars": 374,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stfull_temporal8length486feature256byte.sh",
"chars": 486,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stfull_temporal8length486feature256roman.sh",
"chars": 493,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stfull_temporal8length486feature256romanword.sh",
"chars": 479,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/11stfull_temporal8length486feature256word.sh",
"chars": 468,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/amazonbinary_temporal12length512feature256.sh",
"chars": 343,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/amazonbinary_temporal12length512feature256word.sh",
"chars": 437,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/amazonbinary_temporal8length486feature256.sh",
"chars": 366,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/amazonbinary_temporal8length486feature256word.sh",
"chars": 460,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/amazonfull_temporal12length512feature256.sh",
"chars": 337,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/amazonfull_temporal12length512feature256word.sh",
"chars": 431,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/amazonfull_temporal8length486feature256.sh",
"chars": 360,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/amazonfull_temporal8length486feature256word.sh",
"chars": 454,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/chinanews_temporal12length512feature256.sh",
"chars": 344,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/chinanews_temporal12length512feature256byte.sh",
"chars": 456,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/chinanews_temporal12length512feature256roman.sh",
"chars": 471,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/chinanews_temporal12length512feature256romanword.sh",
"chars": 457,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/chinanews_temporal12length512feature256word.sh",
"chars": 438,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/chinanews_temporal8length486feature256.sh",
"chars": 367,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/chinanews_temporal8length486feature256byte.sh",
"chars": 479,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/chinanews_temporal8length486feature256roman.sh",
"chars": 494,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/chinanews_temporal8length486feature256romanword.sh",
"chars": 480,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/chinanews_temporal8length486feature256word.sh",
"chars": 461,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/dianping_temporal12length512feature256.sh",
"chars": 176,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/dianping_temporal12length512feature256byte.sh",
"chars": 455,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/dianping_temporal12length512feature256roman.sh",
"chars": 470,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/dianping_temporal12length512feature256romanword.sh",
"chars": 442,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/dianping_temporal12length512feature256word.sh",
"chars": 423,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/dianping_temporal8length486feature256.sh",
"chars": 262,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/dianping_temporal8length486feature256byte.sh",
"chars": 478,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/dianping_temporal8length486feature256roman.sh",
"chars": 493,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/dianping_temporal8length486feature256romanword.sh",
"chars": 465,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/dianping_temporal8length486feature256word.sh",
"chars": 446,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/ifeng_temporal12length512feature256.sh",
"chars": 332,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/ifeng_temporal12length512feature256byte.sh",
"chars": 444,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/ifeng_temporal12length512feature256roman.sh",
"chars": 459,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/ifeng_temporal12length512feature256romanword.sh",
"chars": 445,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/ifeng_temporal12length512feature256word.sh",
"chars": 426,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/ifeng_temporal8length486feature256.sh",
"chars": 355,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/ifeng_temporal8length486feature256byte.sh",
"chars": 467,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/ifeng_temporal8length486feature256roman.sh",
"chars": 482,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/ifeng_temporal8length486feature256romanword.sh",
"chars": 468,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/ifeng_temporal8length486feature256word.sh",
"chars": 449,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdbinary_temporal12length512feature256.sh",
"chars": 351,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdbinary_temporal12length512feature256byte.sh",
"chars": 463,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdbinary_temporal12length512feature256roman.sh",
"chars": 478,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdbinary_temporal12length512feature256romanword.sh",
"chars": 464,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdbinary_temporal12length512feature256word.sh",
"chars": 445,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdbinary_temporal8length486feature256.sh",
"chars": 374,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdbinary_temporal8length486feature256byte.sh",
"chars": 486,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdbinary_temporal8length486feature256roman.sh",
"chars": 501,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdbinary_temporal8length486feature256romanword.sh",
"chars": 487,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdbinary_temporal8length486feature256word.sh",
"chars": 468,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdfull_temporal12length512feature256.sh",
"chars": 345,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdfull_temporal12length512feature256byte.sh",
"chars": 457,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdfull_temporal12length512feature256roman.sh",
"chars": 472,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdfull_temporal12length512feature256romanword.sh",
"chars": 458,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdfull_temporal12length512feature256word.sh",
"chars": 439,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdfull_temporal8length486feature256.sh",
"chars": 368,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdfull_temporal8length486feature256byte.sh",
"chars": 480,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdfull_temporal8length486feature256roman.sh",
"chars": 495,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdfull_temporal8length486feature256romanword.sh",
"chars": 481,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jdfull_temporal8length486feature256word.sh",
"chars": 462,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointbinary_temporal12length512feature256.sh",
"chars": 361,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointbinary_temporal12length512feature256byte.sh",
"chars": 473,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointbinary_temporal12length512feature256roman.sh",
"chars": 486,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointbinary_temporal12length512feature256romanword.sh",
"chars": 472,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointbinary_temporal12length512feature256word.sh",
"chars": 455,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointbinary_temporal8length486feature256.sh",
"chars": 384,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointbinary_temporal8length486feature256byte.sh",
"chars": 496,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointbinary_temporal8length486feature256roman.sh",
"chars": 509,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointbinary_temporal8length486feature256romanword.sh",
"chars": 495,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointbinary_temporal8length486feature256word.sh",
"chars": 478,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointfull_temporal12length512feature256.sh",
"chars": 355,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointfull_temporal12length512feature256byte.sh",
"chars": 467,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointfull_temporal12length512feature256roman.sh",
"chars": 480,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointfull_temporal12length512feature256romanword.sh",
"chars": 466,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointfull_temporal12length512feature256word.sh",
"chars": 449,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointfull_temporal8length486feature256.sh",
"chars": 378,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointfull_temporal8length486feature256byte.sh",
"chars": 490,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointfull_temporal8length486feature256roman.sh",
"chars": 503,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointfull_temporal8length486feature256romanword.sh",
"chars": 489,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/jointfull_temporal8length486feature256word.sh",
"chars": 472,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/nytimes_temporal12length512feature256.sh",
"chars": 338,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/nytimes_temporal12length512feature256word.sh",
"chars": 432,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/nytimes_temporal8length486feature256.sh",
"chars": 361,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/nytimes_temporal8length486feature256word.sh",
"chars": 455,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenbinary_temporal12length512feature256.sh",
"chars": 366,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenbinary_temporal12length512feature256byte.sh",
"chars": 478,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenbinary_temporal12length512feature256roman.sh",
"chars": 495,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenbinary_temporal12length512feature256romanword.sh",
"chars": 481,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenbinary_temporal12length512feature256word.sh",
"chars": 460,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenbinary_temporal8length486feature256.sh",
"chars": 389,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenbinary_temporal8length486feature256byte.sh",
"chars": 501,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenbinary_temporal8length486feature256roman.sh",
"chars": 518,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenbinary_temporal8length486feature256romanword.sh",
"chars": 504,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenbinary_temporal8length486feature256word.sh",
"chars": 483,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenfull_temporal12length512feature256.sh",
"chars": 360,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenfull_temporal12length512feature256byte.sh",
"chars": 472,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenfull_temporal12length512feature256roman.sh",
"chars": 489,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenfull_temporal12length512feature256romanword.sh",
"chars": 475,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenfull_temporal12length512feature256word.sh",
"chars": 454,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenfull_temporal8length486feature256.sh",
"chars": 383,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenfull_temporal8length486feature256byte.sh",
"chars": 495,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenfull_temporal8length486feature256roman.sh",
"chars": 512,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenfull_temporal8length486feature256romanword.sh",
"chars": 498,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/archive/rakutenfull_temporal8length486feature256word.sh",
"chars": 477,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2016 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "embednet/config.lua",
"chars": 6980,
"preview": "--[[\nConfiguration for EmbedNet\nCopyright Xiang Zhang 2016\n--]]\n\n-- Name space\nlocal config = {}\n\n-- Training data confi"
},
{
"path": "embednet/data.lua",
"chars": 1631,
"preview": "--[[\nData class for Embedding Net\nCopyright 2016 Xiang Zhang\n--]]\n\nlocal class = require('pl.class')\nlocal torch = requi"
},
{
"path": "embednet/driver.lua",
"chars": 1608,
"preview": "--[[\nDriver for EmbedNet training\nCopyright 2016 Xiang Zhang\n--]]\n\nlocal class = require('pl.class')\n\nlocal parent = req"
},
{
"path": "embednet/model.lua",
"chars": 4956,
"preview": "--[[\nModel for EmbedNet\nCopyright 2016 Xiang Zhang\n--]]\n\nlocal class = require('pl.class')\nlocal nn = require('nn')\n\nloc"
},
{
"path": "embednet/unittest/data.lua",
"chars": 1548,
"preview": "--[[\nUnit test for EmbedNet data component\nCopyright 2016 Xiang Zhang\n--]]\n\nlocal Data = require('data')\n\n-- A Logic Nam"
},
{
"path": "embednet/unittest/driver.lua",
"chars": 1029,
"preview": "--[[\nUnit test for EmbedNet driver component\nCopyright 2016 Xiang Zhang\n--]]\n\nlocal Driver = require('driver')\n\n-- A Lo"
},
{
"path": "embednet/unittest/model.lua",
"chars": 2943,
"preview": "--[[\nUnit Test for EmbedNet model\nCopyright 2016 Xiang Zhang\n--]]\n\nlocal Model = require('model')\n\nlocal sys = require('"
},
{
"path": "embednet/unittest/model_cudnn.lua",
"chars": 3088,
"preview": "--[[\nUnit Test for EmbedNet model\nCopyright 2016 Xiang Zhang\n--]]\n\nlocal Model = require('model')\n\nlocal cutorch = requi"
},
{
"path": "embednet/unittest/model_cunn.lua",
"chars": 3087,
"preview": "--[[\nUnit Test for EmbedNet model\nCopyright 2016 Xiang Zhang\n--]]\n\nlocal Model = require('model')\n\nlocal cutorch = requi"
},
{
"path": "embednet/unittest/test.lua",
"chars": 1765,
"preview": "--[[\nUnit test for EmbedNet test component\nCopyright 2015-2016 Xiang Zhang\n--]]\n\nlocal Test = require('test')\n\nlocal nn "
},
{
"path": "embednet/unittest/test_cuda.lua",
"chars": 1892,
"preview": "--[[\nUnit test for EmbedNet test component\nCopyright 2016 Xiang Zhang\n--]]\n\nlocal Test = require('test')\n\nlocal cutorch "
},
{
"path": "embednet/unittest/train.lua",
"chars": 2017,
"preview": "--[[\nUnit test for EmbedNet train component\nCopyright 2015-2016 Xiang Zhang\n--]]\n\nlocal Train = require('train')\n\nlocal "
},
{
"path": "embednet/unittest/train_cuda.lua",
"chars": 2156,
"preview": "--[[\nUnit test for EmbedNet train component\nCopyright 2015-2016 Xiang Zhang\n--]]\n\nlocal Train = require('train')\n\nlocal "
},
{
"path": "embednet/visualizer.lua",
"chars": 283,
"preview": "--[[\nVisualization module for EmbedNet\nCopyright 2016 Xiang Zhang\n--]]\n\nlocal class = require('pl.class')\n\nlocal parent "
},
{
"path": "fasttext/archive/11stbinary_charbigram.sh",
"chars": 565,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_charbigram_evaluation.sh",
"chars": 938,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_charbigram_tuned.sh",
"chars": 571,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_charpentagram.sh",
"chars": 568,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_charpentagram_evaluation.sh",
"chars": 941,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_charpentagram_tuned.sh",
"chars": 573,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_charunigram.sh",
"chars": 566,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_charunigram_evaluation.sh",
"chars": 939,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_charunigram_tuned.sh",
"chars": 572,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_wordbigram.sh",
"chars": 565,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_wordbigram_evaluation.sh",
"chars": 938,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
},
{
"path": "fasttext/archive/11stbinary_wordbigram_tuned.sh",
"chars": 570,
"preview": "#!/bin/bash\n\n# Archived program command-line for experiment\n# Copyright 2017 Xiang Zhang\n#\n# Usage: bash {this_file} [ad"
}
]
// ... and 633 more files (download for full content)
About this extraction
This page contains the full source code of the zhangxiangxiao/glyph GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 833 files (789.9 KB), approximately 253.1k tokens, and a symbol index with 39 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.