Full Code of Happy-zyy/Competition for AI

master bceb130a7261 cached

45 files

220.1 KB

59.5k tokens

224 symbols

1 requests

Download .txt

Showing preview only (241K chars total). Download the full file or copy to clipboard to get everything.

Repository: Happy-zyy/Competition
Branch: master
Commit: bceb130a7261
Files: 45
Total size: 220.1 KB

Directory structure:
gitextract_uf73lln7/

├── ReadMe.md
└── zhihu-text-classification-master/
    ├── data_process/
    │   ├── .idea/
    │   │   ├── .name
    │   │   ├── data_process.iml
    │   │   ├── deployment.xml
    │   │   ├── encodings.xml
    │   │   ├── misc.xml
    │   │   ├── modules.xml
    │   │   └── workspace.xml
    │   ├── README.md
    │   ├── char2id.py
    │   ├── creat_batch_data.py
    │   ├── creat_batch_seg.py
    │   ├── embed2ndarray.py
    │   ├── question_and_topic_2id.py
    │   ├── run_all_data_process.sh
    │   ├── test.py
    │   └── word2id.py
    └── models/
        ├── wd_1_1_cnn_concat/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        ├── wd_1_2_cnn_max/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        ├── wd_2_hcnn/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        ├── wd_3_bigru/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        ├── wd_4_han/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        ├── wd_5_bigru_cnn/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        └── wd_6_rcnn/
            ├── __init__.py
            ├── network.py
            ├── predict.py
            └── train.py

================================================
FILE CONTENTS
================================================

================================================
FILE: ReadMe.md
================================================
# 竞赛列表
+ [2017 知乎看山杯机器学习挑战赛](https://www.biendata.com/competition/zhihu/)


================================================
FILE: zhihu-text-classification-master/data_process/.idea/.name
================================================
data_process

================================================
FILE: zhihu-text-classification-master/data_process/.idea/data_process.iml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="projectConfiguration" value="Nosetests" />
    <option name="PROJECT_TEST_RUNNER" value="Nosetests" />
  </component>
</module>

================================================
FILE: zhihu-text-classification-master/data_process/.idea/deployment.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PublishConfigData">
    <serverData>
      <paths name="Copy of project-level server 'model'">
        <serverdata>
          <mappings>
            <mapping local="$PROJECT_DIR$" web="/" />
          </mappings>
        </serverdata>
      </paths>
    </serverData>
  </component>
</project>

================================================
FILE: zhihu-text-classification-master/data_process/.idea/encodings.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Encoding" native2AsciiForPropertiesFiles="true" defaultCharsetForPropertiesFiles="UTF-8">
    <file url="PROJECT" charset="UTF-8" />
  </component>
</project>

================================================
FILE: zhihu-text-classification-master/data_process/.idea/misc.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
    <OptionsSetting value="true" id="Add" />
    <OptionsSetting value="true" id="Remove" />
    <OptionsSetting value="true" id="Checkout" />
    <OptionsSetting value="true" id="Update" />
    <OptionsSetting value="true" id="Status" />
    <OptionsSetting value="true" id="Edit" />
    <ConfirmationsSetting value="0" id="Add" />
    <ConfirmationsSetting value="0" id="Remove" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.10 (C:\Python27\python.exe)" project-jdk-type="Python SDK" />
</project>

================================================
FILE: zhihu-text-classification-master/data_process/.idea/modules.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/data_process.iml" filepath="$PROJECT_DIR$/.idea/data_process.iml" />
    </modules>
  </component>
</project>

================================================
FILE: zhihu-text-classification-master/data_process/.idea/workspace.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ChangeListManager">
    <list default="true" id="52869793-115f-4122-a848-13fbfe0ca81e" name="Default" comment="" />
    <ignored path="data_process.iws" />
    <ignored path=".idea/workspace.xml" />
    <ignored path=".idea/dataSources.local.xml" />
    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
    <option name="TRACKING_ENABLED" value="true" />
    <option name="SHOW_DIALOG" value="false" />
    <option name="HIGHLIGHT_CONFLICTS" value="true" />
    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
    <option name="LAST_RESOLUTION" value="IGNORE" />
  </component>
  <component name="ChangesViewManager" flattened_view="true" show_ignored="false" />
  <component name="CreatePatchCommitExecutor">
    <option name="PATCH_PATH" value="" />
  </component>
  <component name="ExecutionTargetManager" SELECTED_TARGET="default_target" />
  <component name="FavoritesManager">
    <favorites_list name="data_process" />
  </component>
  <component name="FileEditorManager">
    <leaf>
      <file leaf-file-name="README.md" pinned="false" current-in-tab="true">
        <entry file="file://$PROJECT_DIR$/README.md">
          <provider selected="true" editor-type-id="split-provider[text-editor;MarkdownPreviewEditor]">
            <state split_layout="FIRST">
              <first_editor vertical-scroll-proportion="0.27194068">
                <caret line="10" column="48" selection-start-line="9" selection-start-column="29" selection-end-line="10" selection-end-column="48" />
                <folding />
              </first_editor>
              <second_editor />
            </state>
          </provider>
          <provider editor-type-id="MarkdownFxPreviewEditor">
            <state />
          </provider>
        </entry>
      </file>
      <file leaf-file-name="char2id.py" pinned="false" current-in-tab="false">
        <entry file="file://$PROJECT_DIR$/char2id.py">
          <provider selected="true" editor-type-id="text-editor">
            <state vertical-scroll-proportion="0.0">
              <caret line="0" column="0" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
              <folding />
            </state>
          </provider>
        </entry>
      </file>
      <file leaf-file-name="word2id.py" pinned="false" current-in-tab="false">
        <entry file="file://$PROJECT_DIR$/word2id.py">
          <provider selected="true" editor-type-id="text-editor">
            <state vertical-scroll-proportion="0.0">
              <caret line="0" column="0" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
              <folding />
            </state>
          </provider>
        </entry>
      </file>
      <file leaf-file-name="creat_batch_seg.py" pinned="false" current-in-tab="false">
        <entry file="file://$PROJECT_DIR$/creat_batch_seg.py">
          <provider selected="true" editor-type-id="text-editor">
            <state vertical-scroll-proportion="0.0">
              <caret line="31" column="0" selection-start-line="31" selection-start-column="0" selection-end-line="31" selection-end-column="0" />
              <folding />
            </state>
          </provider>
        </entry>
      </file>
      <file leaf-file-name="question_and_topic_2id.py" pinned="false" current-in-tab="false">
        <entry file="file://$PROJECT_DIR$/question_and_topic_2id.py">
          <provider selected="true" editor-type-id="text-editor">
            <state vertical-scroll-proportion="0.0">
              <caret line="10" column="35" selection-start-line="10" selection-start-column="35" selection-end-line="10" selection-end-column="35" />
              <folding />
            </state>
          </provider>
        </entry>
      </file>
    </leaf>
  </component>
  <component name="IdeDocumentHistory">
    <option name="CHANGED_PATHS">
      <list>
        <option value="$PROJECT_DIR$/embed2np.py" />
        <option value="$PROJECT_DIR$/creat_batch_data.py" />
        <option value="$PROJECT_DIR$/creat_batch_seg.py" />
        <option value="$PROJECT_DIR$/README.md" />
      </list>
    </option>
  </component>
  <component name="JsBuildToolGruntFileManager" detection-done="true" />
  <component name="JsBuildToolPackageJson" detection-done="true" />
  <component name="JsGulpfileManager">
    <detection-done>true</detection-done>
  </component>
  <component name="ProjectFrameBounds">
    <option name="x" value="-8" />
    <option name="y" value="-8" />
    <option name="width" value="1696" />
    <option name="height" value="1026" />
  </component>
  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
    <OptionsSetting value="true" id="Add" />
    <OptionsSetting value="true" id="Remove" />
    <OptionsSetting value="true" id="Checkout" />
    <OptionsSetting value="true" id="Update" />
    <OptionsSetting value="true" id="Status" />
    <OptionsSetting value="true" id="Edit" />
    <ConfirmationsSetting value="0" id="Add" />
    <ConfirmationsSetting value="0" id="Remove" />
  </component>
  <component name="ProjectView">
    <navigator currentView="ProjectPane" proportions="" version="1">
      <flattenPackages />
      <showMembers />
      <showModules />
      <showLibraryContents />
      <hideEmptyPackages />
      <abbreviatePackageNames />
      <autoscrollToSource />
      <autoscrollFromSource />
      <sortByType />
      <manualOrder />
      <foldersAlwaysOnTop value="true" />
    </navigator>
    <panes>
      <pane id="ProjectPane">
        <subPane>
          <PATH>
            <PATH_ELEMENT>
              <option name="myItemId" value="data_process" />
              <option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
            </PATH_ELEMENT>
          </PATH>
        </subPane>
      </pane>
      <pane id="Scope" />
      <pane id="Scratches" />
    </panes>
  </component>
  <component name="PropertiesComponent">
    <property name="settings.editor.selected.configurable" value="File.Encoding" />
    <property name="settings.editor.splitter.proportion" value="0.2" />
    <property name="WebServerToolWindowFactoryState" value="true" />
  </component>
  <component name="RunManager">
    <configuration default="true" type="DjangoTestsConfigurationType" factoryName="Django tests">
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="" />
      <option name="IS_MODULE_SDK" value="false" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <module name="data_process" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
      <option name="TARGET" value="" />
      <option name="SETTINGS_FILE" value="" />
      <option name="CUSTOM_SETTINGS" value="false" />
      <option name="USE_OPTIONS" value="false" />
      <option name="OPTIONS" value="" />
      <method />
    </configuration>
    <configuration default="true" type="JavascriptDebugType" factoryName="JavaScript Debug">
      <method />
    </configuration>
    <configuration default="true" type="PyBehaveRunConfigurationType" factoryName="Behave">
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs />
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="" />
      <option name="IS_MODULE_SDK" value="false" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <module name="data_process" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
      <option name="ADDITIONAL_ARGS" value="" />
      <method />
    </configuration>
    <configuration default="true" type="PyLettuceRunConfigurationType" factoryName="Lettuce">
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs />
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="" />
      <option name="IS_MODULE_SDK" value="false" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <module name="data_process" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
      <option name="ADDITIONAL_ARGS" value="" />
      <method />
    </configuration>
    <configuration default="true" type="PythonConfigurationType" factoryName="Python">
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="" />
      <option name="IS_MODULE_SDK" value="false" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <module name="data_process" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <method />
    </configuration>
    <configuration default="true" type="js.build_tools.gulp" factoryName="Gulp.js">
      <method />
    </configuration>
    <configuration default="true" type="js.build_tools.npm" factoryName="npm">
      <command value="run-script" />
      <scripts />
      <envs />
      <method />
    </configuration>
    <configuration default="true" type="tests" factoryName="Attests">
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs />
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="" />
      <option name="IS_MODULE_SDK" value="false" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <module name="data_process" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="" />
      <option name="CLASS_NAME" value="" />
      <option name="METHOD_NAME" value="" />
      <option name="FOLDER_NAME" value="" />
      <option name="TEST_TYPE" value="TEST_SCRIPT" />
      <option name="PATTERN" value="" />
      <option name="USE_PATTERN" value="false" />
      <method />
    </configuration>
    <configuration default="true" type="tests" factoryName="Doctests">
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs />
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="" />
      <option name="IS_MODULE_SDK" value="false" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <module name="data_process" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="" />
      <option name="CLASS_NAME" value="" />
      <option name="METHOD_NAME" value="" />
      <option name="FOLDER_NAME" value="" />
      <option name="TEST_TYPE" value="TEST_SCRIPT" />
      <option name="PATTERN" value="" />
      <option name="USE_PATTERN" value="false" />
      <method />
    </configuration>
    <configuration default="true" type="tests" factoryName="Nosetests">
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs />
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="" />
      <option name="IS_MODULE_SDK" value="false" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <module name="data_process" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="" />
      <option name="CLASS_NAME" value="" />
      <option name="METHOD_NAME" value="" />
      <option name="FOLDER_NAME" value="" />
      <option name="TEST_TYPE" value="TEST_SCRIPT" />
      <option name="PATTERN" value="" />
      <option name="USE_PATTERN" value="false" />
      <option name="PARAMS" value="" />
      <option name="USE_PARAM" value="false" />
      <method />
    </configuration>
    <configuration default="true" type="tests" factoryName="Unittests">
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs />
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="" />
      <option name="IS_MODULE_SDK" value="false" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <module name="data_process" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="" />
      <option name="CLASS_NAME" value="" />
      <option name="METHOD_NAME" value="" />
      <option name="FOLDER_NAME" value="" />
      <option name="TEST_TYPE" value="TEST_SCRIPT" />
      <option name="PATTERN" value="" />
      <option name="USE_PATTERN" value="false" />
      <option name="PUREUNITTEST" value="true" />
      <option name="PARAMS" value="" />
      <option name="USE_PARAM" value="false" />
      <method />
    </configuration>
    <configuration default="true" type="tests" factoryName="py.test">
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs />
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="" />
      <option name="IS_MODULE_SDK" value="false" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <module name="data_process" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="" />
      <option name="CLASS_NAME" value="" />
      <option name="METHOD_NAME" value="" />
      <option name="FOLDER_NAME" value="" />
      <option name="TEST_TYPE" value="TEST_SCRIPT" />
      <option name="PATTERN" value="" />
      <option name="USE_PATTERN" value="false" />
      <option name="testToRun" value="" />
      <option name="keywords" value="" />
      <option name="params" value="" />
      <option name="USE_PARAM" value="false" />
      <option name="USE_KEYWORD" value="false" />
      <method />
    </configuration>
  </component>
  <component name="ShelveChangesManager" show_recycled="false" />
  <component name="TaskManager">
    <task active="true" id="Default" summary="Default task">
      <changelist id="52869793-115f-4122-a848-13fbfe0ca81e" name="Default" comment="" />
      <created>1504013687331</created>
      <option name="number" value="Default" />
      <updated>1504013687331</updated>
    </task>
    <servers />
  </component>
  <component name="ToolWindowManager">
    <frame x="-8" y="-8" width="1696" height="1026" extended-state="0" />
    <editor active="true" />
    <layout>
      <window_info id="Remote Host" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
      <window_info id="Project" active="true" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.25305623" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
      <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
      <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
      <window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
      <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
      <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
      <window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
      <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
      <window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
      <window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
      <window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
      <window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
      <window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
      <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
      <window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
      <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
      <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
      <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
    </layout>
  </component>
  <component name="VcsContentAnnotationSettings">
    <option name="myLimit" value="2678400000" />
  </component>
  <component name="XDebuggerManager">
    <breakpoint-manager />
    <watches-manager />
  </component>
  <component name="editorHistoryManager">
    <entry file="file://$PROJECT_DIR$/embed2ndarray.py">
      <provider selected="true" editor-type-id="text-editor">
        <state vertical-scroll-proportion="0.0">
          <caret line="61" column="37" selection-start-line="61" selection-start-column="37" selection-end-line="61" selection-end-column="37" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/word2id.py">
      <provider selected="true" editor-type-id="text-editor">
        <state vertical-scroll-proportion="0.0">
          <caret line="0" column="0" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/char2id.py">
      <provider selected="true" editor-type-id="text-editor">
        <state vertical-scroll-proportion="0.0">
          <caret line="0" column="0" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/question_and_topic_2id.py">
      <provider selected="true" editor-type-id="text-editor">
        <state vertical-scroll-proportion="0.0">
          <caret line="10" column="35" selection-start-line="10" selection-start-column="35" selection-end-line="10" selection-end-column="35" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/creat_batch_seg.py">
      <provider selected="true" editor-type-id="text-editor">
        <state vertical-scroll-proportion="0.0">
          <caret line="31" column="0" selection-start-line="31" selection-start-column="0" selection-end-line="31" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/creat_batch_data.py">
      <provider selected="true" editor-type-id="text-editor">
        <state vertical-scroll-proportion="0.13237064">
          <caret line="13" column="22" selection-start-line="13" selection-start-column="22" selection-end-line="13" selection-end-column="22" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/README.md">
      <provider selected="true" editor-type-id="split-provider[text-editor;MarkdownPreviewEditor]">
        <state split_layout="FIRST">
          <first_editor vertical-scroll-proportion="0.27194068">
            <caret line="10" column="48" selection-start-line="9" selection-start-column="29" selection-end-line="10" selection-end-column="48" />
            <folding />
          </first_editor>
          <second_editor />
        </state>
      </provider>
      <provider editor-type-id="MarkdownFxPreviewEditor">
        <state />
      </provider>
    </entry>
  </component>
</project>

================================================
FILE: zhihu-text-classification-master/data_process/README.md
================================================
## 数据处理

1.把比赛提供的所有数据解压到 raw_data/ 目录下。<br/>
2.按照顺序依次执行各个 .py，不带任何参数。<br/>
  或者在当前目录下输入下面命令运行所有文件：<br/>
  dos2unix run_all_data_process.sh   # 使用cygwin工具dos2unix将script改为unix格式<br/>
  sh run_all_data_process.sh<br/>
3.环境依赖(下面是我使用的版本) <br/>
- numpy		1.12.1
- pandas 	0.19.2
- word2vec	0.9.1
- tqdm		4.11.2


### embed2ndarray.py
赛方提供了txt格式的词向量和字向量，这里把embedding矩阵转成 np.ndarray 形式，分别保存为 data/word_embedding.npy 和 data/char_embedding.npy。在赛方提供的词向量基础上，添加 '\<PAD\>' 和 '\<UNK\>' 两个特殊符号。其中 '\<PAD\>' 用于将序列补全到固定长度， '\<UNK\>' 用于替换低频词（字）。
用 pd.Series 保存词（字）对应 embedding 中的行号（id），存储在 data/sr_word2id.pkl 和 data/sr_char2id.pkl 中。

### question_and_topic_2id.py
把问题和话题转为id形式，保存在 data/sr_question2id.pkl 和 data/sr_id2question.pkl 中。

### char2id.py
利用上面得到的 sr_char2id，把所有问题的字转为对应的 id，存储为：
- data/ch_train_title.npy
- data/ch_train_content.npy
- data/ch_eval_title.npy
- data/ch_eval_content.npy

### word2id.py
同 char2id.py

### creat_batch_data.py
把所有的数据按照 batch_size(128) 进行打包，固定seed，随机取 10 万样本作为验证集。每个batch存储为一个 npz 文件，包括 X, y 两部分。
这里所有的序列都进行了截断，长度不足的用0进行padding到固定长度。
保存位置：
```
wd_train_path = '../data/wd-data/data_train/'
wd_valid_path = '../data/wd-data/data_valid/'
wd_test_path = '../data/wd-data/data_test/'
ch_train_path = '../data/ch-data/data_train/'
ch_valid_path = '../data/ch-data/data_valid/'
ch_test_path = '../data/ch-data/data_test/'
```


### creat_batch_seg.py
和 creat_batch_data.py 相同，只是对 content 部分进行句子划分。用于分层模型。
划分句子长度：
wd_title_len = 30, wd_sent_len = 30, wd_doc_len = 10.(即content划分为10个句子，每个句子长度为30个词)
ch_title_len = 52, ch_sent_len = 52, ch_doc_len = 10.
不划分句子：
wd_title_len = 30, wd_content_len = 150.
ch_title_len = 52, ch_content_len = 300.


### To do
- 在数据读取中使用 tfrecord 文件进行数据读取。这样能够随时改变 batch_size， 而且 shuffle 会比使用 numpy 更加均匀。
- 添加序列长度信息。在这里所有的序列都截断或者padding为固定长度，在误差计算中没有处理padding部分，可能会使准确率下降。在使用 dynamic_rnn 的时候加上 sequence_length 信息，在计算的时候忽略 padding 部分。同时结合 tf.train.SequenceExample() 和 tf.train.batch() 自动 padding，也可以减少数据量。

================================================
FILE: zhihu-text-classification-master/data_process/char2id.py
================================================
# -*- coding:utf-8 -*-

from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import pickle
from multiprocessing import Pool
from tqdm import tqdm
import time


# Directory that holds the artifacts written by the earlier preprocessing steps.
save_path = '../data/'
# The pickle file stores two objects back to back; they are read in the order
# they were written: first sr_id2char, then sr_char2id.
with open(save_path + 'sr_char2id.pkl', 'rb') as inp:
    sr_id2char = pickle.load(inp)
    sr_char2id = pickle.load(inp)
# Plain-dict copy of sr_char2id (index entry -> value), used by get_id for lookups.
dict_char2id = dict(zip(sr_char2id.index.values, sr_char2id.values))


def get_id(char):
    """Return the id of `char`.

    A char that is not a key of dict_char2id is replaced by the id 1.
    """
    return dict_char2id.get(char, 1)


def get_id4chars(chars):
    """Convert a comma-separated string of chars into the list of their ids."""
    # Strip surrounding whitespace, split on ',' and look every char up.
    return [get_id(token) for token in chars.strip().split(',')]


def test_char2id():
    """Convert every char of the eval set to its id and save the result.

    Reads ../raw_data/question_eval_set.txt (columns 0, 1 and 3), fills a
    missing title with the content, then a missing content with the title,
    and writes ../data/ch_eval_title.npy and ../data/ch_eval_content.npy.
    Each saved array holds one list of ids per question.
    """
    time0 = time.time()
    print('Processing eval data.')
    df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t',  usecols=[0, 1, 3],
                          names=['question_id', 'char_title', 'char_content'], dtype={'question_id': object})
    print('test question number %d' % len(df_eval))
    # Questions without a title borrow their content.
    na_title = df_eval.char_title.isnull()
    print('There are %d test questions without title.' % na_title.sum())
    df_eval['char_title'] = df_eval['char_title'].fillna(df_eval['char_content'])
    # Questions without a content borrow their (possibly just filled) title.
    na_content = df_eval.char_content.isnull()
    print('There are %d test questions without content.' % na_content.sum())
    df_eval['char_content'] = df_eval['char_content'].fillna(df_eval['char_title'])
    # Convert to ids in parallel.
    p = Pool()
    try:
        # dtype=object: the id lists have different lengths, and NumPy >= 1.24
        # refuses to build such a ragged array implicitly.
        eval_title = np.asarray(p.map(get_id4chars, df_eval.char_title.values), dtype=object)
        np.save('../data/ch_eval_title.npy', eval_title)
        eval_content = np.asarray(p.map(get_id4chars, df_eval.char_content.values), dtype=object)
        np.save('../data/ch_eval_content.npy', eval_content)
    finally:
        # Release the worker processes even when a conversion fails.
        p.close()
        p.join()
    print('Finished changing the eval chars to ids. Costed time %g s' % (time.time()-time0))


def train_char2id():
    """Convert every char of the training set to its id and save the result.

    Reads ../raw_data/question_train_set.txt (columns 0, 1 and 3), fills a
    missing content with the title, drops the samples without a title, and
    writes ../data/ch_train_title.npy and ../data/ch_train_content.npy.
    """
    time0 = time.time()
    print('Processing train data.')
    df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 1, 3],
                           names=['question_id', 'char_title', 'char_content'], dtype={'question_id': object})
    print('training question number %d ' % len(df_train))
    # Questions without a content borrow their title.
    na_content = df_train.char_content.isnull()
    print('There are %d train questions without content.' % na_content.sum())
    df_train['char_content'] = df_train['char_content'].fillna(df_train['char_title'])
    # Samples without a title are dropped. The fixed rows below are always
    # dropped so that the char data stays aligned with the word data.
    na_title_indexs = [328877, 422123, 633584, 768738, 818616, 876828, 1273673, 1527297,
              1636237, 1682969, 2052477, 2628516, 2657464, 2904162, 2993517]
    # Row positions equal the index labels here (default index from read_csv).
    na_title_indexs.extend(np.flatnonzero(df_train.char_title.isnull().values).tolist())
    print('There are %d train questions without title.' % len(na_title_indexs))
    df_train = df_train.drop(na_title_indexs)
    print('After dropping, training question number(should be 2999952) = %d' % len(df_train))
    # Convert to ids in parallel.
    p = Pool()
    try:
        # dtype=object: the id lists have different lengths, and NumPy >= 1.24
        # refuses to build such a ragged array implicitly.
        train_title = np.asarray(p.map(get_id4chars, df_train.char_title.values), dtype=object)
        np.save('../data/ch_train_title.npy', train_title)
        train_content = np.asarray(p.map(get_id4chars, df_train.char_content.values), dtype=object)
        np.save('../data/ch_train_content.npy', train_content)
    finally:
        # Release the worker processes even when a conversion fails.
        p.close()
        p.join()
    print('Finished changing the training chars to ids. Costed time %g s' % (time.time() - time0))


if __name__ == '__main__':
    # Convert the eval set first, then the training set.
    for convert in (test_char2id, train_char2id):
        convert()








================================================
FILE: zhihu-text-classification-master/data_process/creat_batch_data.py
================================================
# -*- coding:utf-8 -*-

from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import pickle
from multiprocessing import Pool
import sys
import os

sys.path.append('../')
from data_helpers import pad_X30
from data_helpers import pad_X150
from data_helpers import pad_X52
from data_helpers import pad_X300
from data_helpers import train_batch
from data_helpers import eval_batch

""" 把所有的数据按照 batch_size(128) 进行打包。取 10万 样本作为验证集。
word_title_len = 30.
word_content_len = 150.
char_title_len = 52.
char_content_len = 300.
"""


# Output directories for the packed batches (word level and char level).
wd_train_path = '../data/wd-data/data_train/'
wd_valid_path = '../data/wd-data/data_valid/'
wd_test_path = '../data/wd-data/data_test/'
ch_train_path = '../data/ch-data/data_train/'
ch_valid_path = '../data/ch-data/data_valid/'
ch_test_path = '../data/ch-data/data_test/'
paths = [wd_train_path, wd_valid_path, wd_test_path,
         ch_train_path, ch_valid_path, ch_test_path]
# Create whichever output directories do not exist yet.
for each in [path for path in paths if not os.path.exists(path)]:
    os.makedirs(each)

# Only the first pickled object (topic -> id) is needed here.
with open('../data/sr_topic2id.pkl', 'rb') as inp:
    sr_topic2id = pickle.load(inp)

# Plain dict (topic -> id) used by topics2ids().
dict_topic2id = dict(zip(sr_topic2id.index, sr_topic2id.values))


def topics2ids(topics):
    """Convert a comma-separated string of topics into the list of their ids.

    Raises KeyError when a topic is not a key of dict_topic2id.
    """
    return [dict_topic2id[topic] for topic in topics.split(',')]


def get_lables():
    """Return the topic-id labels of all training samples.

    Reads ../raw_data/question_topic_train_set.txt and drops the fixed list of
    rows below (samples that were discarded earlier because they have no
    title), so the labels line up with the saved training data.

    Returns:
        Object array with one list of topic ids per remaining question.
    """
    df_question_topic = pd.read_csv('../raw_data/question_topic_train_set.txt', sep='\t',
                                    names=['questions', 'topics'], dtype={'questions': object, 'topics': object})
    na_title_indexs = [328877, 422123, 633584, 768738, 818616, 876828, 1273673, 1527297,
                       1636237, 1682969, 2052477, 2628516, 2657464, 2904162, 2993517]
    df_question_topic = df_question_topic.drop(na_title_indexs)
    p = Pool()
    try:
        y = p.map(topics2ids, df_question_topic.topics.values)
    finally:
        # Release the worker processes even when the mapping fails.
        p.close()
        p.join()
    # dtype=object: label lists of different lengths cannot be converted
    # implicitly on NumPy >= 1.24.
    return np.asarray(y, dtype=object)


# word 数据打包
# Pack the word-level data.
def wd_train_get_batch(title_len=30, content_len=150, batch_size=128):
    """Pad the word-level training data, split off a validation set and write batches.

    Titles are padded with pad_X30 and contents with pad_X150, then stacked
    side by side into X. The labels come from get_lables() and are also saved
    to ../data/y_tr.npy. After a seeded shuffle (seed 13) the first 100000
    samples become the validation set and the rest the training set.

    Args:
        title_len: not read by this function; kept for backward compatibility.
        content_len: not read by this function; kept for backward compatibility.
        batch_size: forwarded to train_batch.
    """
    print('loading word train_title and train_content.')
    # The saved id arrays are object arrays of lists, which NumPy >= 1.16.3
    # only loads when pickling is allowed explicitly.
    train_title = np.load('../data/wd_train_title.npy', allow_pickle=True)
    train_content = np.load('../data/wd_train_content.npy', allow_pickle=True)
    p = Pool()
    try:
        X_title = np.asarray(p.map(pad_X30, train_title))
        X_content = np.asarray(p.map(pad_X150, train_content))
    finally:
        # Release the worker processes even when padding fails.
        p.close()
        p.join()
    X = np.hstack([X_title, X_content])
    print('getting labels, this should cost minutes, please wait.')
    y = get_lables()
    print('y.shape=', y.shape)
    np.save('../data/y_tr.npy', y)
    # Split off the validation set with a fixed seed so the split is reproducible.
    sample_num = X.shape[0]
    np.random.seed(13)
    valid_num = 100000
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    X_valid, X_train = X[:valid_num], X[valid_num:]
    y_valid, y_train = y[:valid_num], y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
    print('creating batch data.')
    # Write the validation batches.
    print('valid_sample_num=%d' % len(X_valid))
    train_batch(X_valid, y_valid, wd_valid_path, batch_size)
    # Write the training batches.
    print('train_sample_num=%d' % len(X_train))
    train_batch(X_train, y_train, wd_train_path, batch_size)


def wd_test_get_batch(title_len=30, content_len=150, batch_size=128):
    """Pad the word-level eval data and write it as batches to wd_test_path.

    Args:
        title_len: not read by this function; kept for backward compatibility.
        content_len: not read by this function; kept for backward compatibility.
        batch_size: forwarded to eval_batch.
    """
    # The saved id arrays are object arrays of lists, which NumPy >= 1.16.3
    # only loads when pickling is allowed explicitly.
    eval_title = np.load('../data/wd_eval_title.npy', allow_pickle=True)
    eval_content = np.load('../data/wd_eval_content.npy', allow_pickle=True)
    p = Pool()
    try:
        X_title = np.asarray(p.map(pad_X30, eval_title))
        X_content = np.asarray(p.map(pad_X150, eval_content))
    finally:
        # Release the worker processes even when padding fails.
        p.close()
        p.join()
    X = np.hstack([X_title, X_content])
    sample_num = len(X)
    print('eval_sample_num=%d' % sample_num)
    eval_batch(X, wd_test_path, batch_size)


# char 数据打包
# Pack the char-level data.
def ch_train_get_batch(title_len=52, content_len=300, batch_size=128):
    """Pad the char-level training data, split off a validation set and write batches.

    Titles are padded with pad_X52 and contents with pad_X300, then stacked
    side by side into X. Labels are read from ../data/y_tr.npy. After a seeded
    shuffle (seed 13) the first 100000 samples become the validation set and
    the rest the training set.

    Args:
        title_len: not read by this function; kept for backward compatibility.
        content_len: not read by this function; kept for backward compatibility.
        batch_size: forwarded to train_batch.
    """
    print('loading char train_title and train_content.')
    # The saved arrays are object arrays of lists, which NumPy >= 1.16.3
    # only loads when pickling is allowed explicitly.
    train_title = np.load('../data/ch_train_title.npy', allow_pickle=True)
    train_content = np.load('../data/ch_train_content.npy', allow_pickle=True)
    p = Pool()
    try:
        X_title = np.asarray(p.map(pad_X52, train_title))
        X_content = np.asarray(p.map(pad_X300, train_content))
    finally:
        # Release the worker processes even when padding fails.
        p.close()
        p.join()
    X = np.hstack([X_title, X_content])
    y = np.load('../data/y_tr.npy', allow_pickle=True)
    # Split off the validation set with a fixed seed so the split is reproducible.
    sample_num = X.shape[0]
    np.random.seed(13)
    valid_num = 100000
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    X_valid, X_train = X[:valid_num], X[valid_num:]
    y_valid, y_train = y[:valid_num], y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
    # Write the validation batches.
    print('creating batch data.')
    print('valid_sample_num=%d' % len(X_valid))
    train_batch(X_valid, y_valid, ch_valid_path, batch_size)
    # Write the training batches.
    print('train_sample_num=%d' % len(X_train))
    train_batch(X_train, y_train, ch_train_path, batch_size)


def ch_test_get_batch(title_len=52, content_len=300, batch_size=128):
    """Pad the char-level eval data and write it as batches to ch_test_path.

    Args:
        title_len: not read by this function; kept for backward compatibility.
        content_len: not read by this function; kept for backward compatibility.
        batch_size: forwarded to eval_batch.
    """
    # The saved id arrays are object arrays of lists, which NumPy >= 1.16.3
    # only loads when pickling is allowed explicitly.
    eval_title = np.load('../data/ch_eval_title.npy', allow_pickle=True)
    eval_content = np.load('../data/ch_eval_content.npy', allow_pickle=True)
    p = Pool()
    try:
        X_title = np.asarray(p.map(pad_X52, eval_title))
        X_content = np.asarray(p.map(pad_X300, eval_content))
    finally:
        # Release the worker processes even when padding fails.
        p.close()
        p.join()
    X = np.hstack([X_title, X_content])
    sample_num = len(X)
    print('eval_sample_num=%d' % sample_num)
    eval_batch(X, ch_test_path, batch_size)


if __name__ == '__main__':
    # Word-level data first (it also writes y_tr.npy, which the char-level
    # training step reads), then char-level data.
    for make_batches in (wd_train_get_batch, wd_test_get_batch,
                         ch_train_get_batch, ch_test_get_batch):
        make_batches()


================================================
FILE: zhihu-text-classification-master/data_process/creat_batch_seg.py
================================================
# -*- coding:utf-8 -*-

from __future__ import division
from __future__ import print_function

import numpy as np
from multiprocessing import Pool
import sys
import os

sys.path.append('../')
from data_helpers import pad_X30
from data_helpers import pad_X52
from data_helpers import wd_pad_cut_docs
from data_helpers import ch_pad_cut_docs
from data_helpers import train_batch
from data_helpers import eval_batch


# Output directories for the sentence-segmented batches (word and char level).
wd_train_path = '../data/wd-data/seg_train/'
wd_valid_path = '../data/wd-data/seg_valid/'
wd_test_path = '../data/wd-data/seg_test/'
ch_train_path = '../data/ch-data/seg_train/'
ch_valid_path = '../data/ch-data/seg_valid/'
ch_test_path = '../data/ch-data/seg_test/'
paths = [wd_train_path, wd_valid_path, wd_test_path,
         ch_train_path, ch_valid_path, ch_test_path]
# Create whichever output directories do not exist yet.
for each in [path for path in paths if not os.path.exists(path)]:
    os.makedirs(each)


# word 数据打包
# Pack the word-level data.
def wd_train_get_batch(title_len=30, batch_size=128):
    """Pad/segment the word-level training data, split off a validation set and write batches.

    Titles are padded with pad_X30; contents go through wd_pad_cut_docs and are
    flattened to 30*10 columns per sample. Labels are read from
    ../data/y_tr.npy. After a seeded shuffle (seed 13) the first 100000
    samples become the validation set and the rest the training set.

    Args:
        title_len: not read by this function; kept for backward compatibility.
        batch_size: forwarded to train_batch.
    """
    print('loading word train_title and train_content, this should cost minutes, please wait.')
    # The saved arrays are object arrays of lists, which NumPy >= 1.16.3
    # only loads when pickling is allowed explicitly.
    train_title = np.load('../data/wd_train_title.npy', allow_pickle=True)
    train_content = np.load('../data/wd_train_content.npy', allow_pickle=True)
    p = Pool(6)
    try:
        X_title = np.asarray(p.map(pad_X30, train_title))
        X_content = np.asarray(p.map(wd_pad_cut_docs, train_content))
    finally:
        # Release the worker processes even when padding fails.
        p.close()
        p.join()
    # Flatten every segmented document into one row.
    X_content = X_content.reshape(-1, 30*10)
    X = np.hstack([X_title, X_content])
    y = np.load('../data/y_tr.npy', allow_pickle=True)
    # Split off the validation set with a fixed seed so the split is reproducible.
    sample_num = X.shape[0]
    np.random.seed(13)
    valid_num = 100000
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    X_valid, X_train = X[:valid_num], X[valid_num:]
    y_valid, y_train = y[:valid_num], y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
    # Write the validation batches.
    print('creating batch data.')
    print('valid_sample_num=%d' % len(X_valid))
    train_batch(X_valid, y_valid, wd_valid_path, batch_size)
    # Write the training batches.
    print('train_sample_num=%d' % len(X_train))
    train_batch(X_train, y_train, wd_train_path, batch_size)


def wd_test_get_batch(title_len=30, batch_size=128):
    """Pad/segment the word-level eval data and write it as batches to wd_test_path.

    Args:
        title_len: not read by this function; kept for backward compatibility.
        batch_size: forwarded to eval_batch.
    """
    print('loading word eval_title and eval_content.')
    # The saved id arrays are object arrays of lists, which NumPy >= 1.16.3
    # only loads when pickling is allowed explicitly.
    eval_title = np.load('../data/wd_eval_title.npy', allow_pickle=True)
    eval_content = np.load('../data/wd_eval_content.npy', allow_pickle=True)
    p = Pool(6)
    try:
        X_title = np.asarray(p.map(pad_X30, eval_title))
        X_content = np.asarray(p.map(wd_pad_cut_docs, eval_content))
    finally:
        # Release the worker processes even when padding fails.
        p.close()
        p.join()
    # Flatten every segmented document into one row.
    X_content = X_content.reshape(-1, 30*10)
    X = np.hstack([X_title, X_content])
    sample_num = len(X)
    print('eval_sample_num=%d' % sample_num)
    eval_batch(X, wd_test_path, batch_size)


# char 数据打包
# Pack the char-level data.
def ch_train_get_batch(title_len=52, batch_size=128):
    """Pad/segment the char-level training data, split off a validation set and write batches.

    Titles are padded with pad_X52; contents go through ch_pad_cut_docs and are
    flattened to 52*10 columns per sample. Labels are read from
    ../data/y_tr.npy. After a seeded shuffle (seed 13) the first 100000
    samples become the validation set and the rest the training set.

    Args:
        title_len: not read by this function; kept for backward compatibility.
        batch_size: forwarded to train_batch.
    """
    print('loading char train_title and train_content, this should cost minutes, please wait.')
    # The saved arrays are object arrays of lists, which NumPy >= 1.16.3
    # only loads when pickling is allowed explicitly.
    train_title = np.load('../data/ch_train_title.npy', allow_pickle=True)
    train_content = np.load('../data/ch_train_content.npy', allow_pickle=True)
    p = Pool(8)
    try:
        X_title = np.asarray(p.map(pad_X52, train_title))
        X_content = np.asarray(p.map(ch_pad_cut_docs, train_content))
    finally:
        # Release the worker processes even when padding fails.
        p.close()
        p.join()
    # Flatten every segmented document into one row.
    X_content = X_content.reshape(-1, 52*10)
    X = np.hstack([X_title, X_content])
    y = np.load('../data/y_tr.npy', allow_pickle=True)
    # Split off the validation set with a fixed seed so the split is reproducible.
    sample_num = X.shape[0]
    np.random.seed(13)
    valid_num = 100000
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    X_valid, X_train = X[:valid_num], X[valid_num:]
    y_valid, y_train = y[:valid_num], y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
    # Write the validation batches.
    print('creating batch data.')
    print('valid_sample_num=%d' % len(X_valid))
    train_batch(X_valid, y_valid, ch_valid_path, batch_size)
    # Write the training batches.
    print('train_sample_num=%d' % len(X_train))
    train_batch(X_train, y_train, ch_train_path, batch_size)


def ch_test_get_batch(title_len=52, batch_size=128):
    """Pad/segment the char-level eval data and write it as batches to ch_test_path.

    Args:
        title_len: not read by this function; kept for backward compatibility.
        batch_size: forwarded to eval_batch.
    """
    print('loading char eval_title and eval_content.')
    # The saved id arrays are object arrays of lists, which NumPy >= 1.16.3
    # only loads when pickling is allowed explicitly.
    eval_title = np.load('../data/ch_eval_title.npy', allow_pickle=True)
    eval_content = np.load('../data/ch_eval_content.npy', allow_pickle=True)
    p = Pool()
    try:
        X_title = np.asarray(p.map(pad_X52, eval_title))
        X_content = np.asarray(p.map(ch_pad_cut_docs, eval_content))
    finally:
        # Release the worker processes even when padding fails.
        p.close()
        p.join()
    # Flatten every segmented document into one row.
    X_content = X_content.reshape(-1, 52*10)
    X = np.hstack([X_title, X_content])
    sample_num = len(X)
    print('eval_sample_num=%d' % sample_num)
    eval_batch(X, ch_test_path, batch_size)


if __name__ == '__main__':
    # Word-level data first, then char-level data.
    for make_batches in (wd_train_get_batch, wd_test_get_batch,
                         ch_train_get_batch, ch_test_get_batch):
        make_batches()


================================================
FILE: zhihu-text-classification-master/data_process/embed2ndarray.py
================================================
# -*- coding:utf-8 -*- 

from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import word2vec
import pickle
import os

# Special symbols prepended to the word (char) vocabulary; the symbol at
# position i in this list receives id i and a randomly initialised vector.
SPECIAL_SYMBOL = ['<PAD>', '<EOS>']  # add these special symbols to word(char) embeddings.


def get_word_embedding():
    """Extract the word vectors and save them to ../data/word_embedding.npy.

    Loads ../raw_data/word_embedding.txt with word2vec, prepends one randomly
    initialised vector per entry of SPECIAL_SYMBOL (ids 0..n-1) and shifts the
    ids of the real words accordingly. The word<->id mappings are pickled to
    ../data/sr_word2id.pkl (sr_id2word first, then sr_word2id).
    """
    print('getting the word_embedding.npy')
    wv = word2vec.load('../raw_data/word_embedding.txt')
    word_embedding = wv.vectors
    words = wv.vocab
    n_special_sym = len(SPECIAL_SYMBOL)
    # Real words start right after the special symbols.
    word_ids = range(n_special_sym, n_special_sym + len(words))
    sr_id2word = pd.Series(words, index=word_ids)
    sr_word2id = pd.Series(word_ids, index=words)
    # Take the vector width from the loaded embeddings rather than assuming
    # 256, so the vstack below works for any embedding size.
    embedding_size = word_embedding.shape[1]
    vec_special_sym = np.random.randn(n_special_sym, embedding_size)
    # Register the special symbols under ids 0..n_special_sym-1.
    for i in range(n_special_sym):
        sr_id2word[i] = SPECIAL_SYMBOL[i]
        sr_word2id[SPECIAL_SYMBOL[i]] = i
    word_embedding = np.vstack([vec_special_sym, word_embedding])
    # Save the word vectors.
    save_path = '../data/'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    np.save(save_path + 'word_embedding.npy', word_embedding)
    # Save the word <-> id mappings.
    with open(save_path + 'sr_word2id.pkl', 'wb') as outp:
        pickle.dump(sr_id2word, outp)
        pickle.dump(sr_word2id, outp)
    print('Saving the word_embedding.npy to ../data/word_embedding.npy')


def get_char_embedding():
    """Extract the char vectors and save them to ../data/char_embedding.npy.

    Loads ../raw_data/char_embedding.txt with word2vec, prepends one randomly
    initialised vector per entry of SPECIAL_SYMBOL (ids 0..n-1) and shifts the
    ids of the real chars accordingly. The char<->id mappings are pickled to
    ../data/sr_char2id.pkl (sr_id2char first, then sr_char2id).
    """
    print('getting the char_embedding.npy')
    wv = word2vec.load('../raw_data/char_embedding.txt')
    char_embedding = wv.vectors
    chars = wv.vocab
    n_special_sym = len(SPECIAL_SYMBOL)
    # Real chars start right after the special symbols.
    char_ids = range(n_special_sym, n_special_sym + len(chars))
    sr_id2char = pd.Series(chars, index=char_ids)
    sr_char2id = pd.Series(char_ids, index=chars)

    # Take the vector width from the loaded embeddings rather than assuming
    # 256, so the vstack below works for any embedding size.
    embedding_size = char_embedding.shape[1]

    vec_special_sym = np.random.randn(n_special_sym, embedding_size)
    # Register the special symbols under ids 0..n_special_sym-1.
    for i in range(n_special_sym):
        sr_id2char[i] = SPECIAL_SYMBOL[i]
        sr_char2id[SPECIAL_SYMBOL[i]] = i
    char_embedding = np.vstack([vec_special_sym, char_embedding])
    # Save the char vectors.
    save_path = '../data/'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    np.save(save_path + 'char_embedding.npy', char_embedding)
    # Save the char <-> id mappings.
    with open(save_path + 'sr_char2id.pkl', 'wb') as outp:
        pickle.dump(sr_id2char, outp)
        pickle.dump(sr_char2id, outp)
    print('Saving the char_embedding.npy to ../data/char_embedding.npy')


if __name__ == '__main__':
    # Word embeddings first, then char embeddings.
    for extract in (get_word_embedding, get_char_embedding):
        extract()


================================================
FILE: zhihu-text-classification-master/data_process/question_and_topic_2id.py
================================================
# -*- coding:utf-8 -*- 

import pandas as pd
import pickle
from itertools import chain


def question_and_topic_2id():
    """Assign integer ids to questions and topics and pickle the mappings under ../data/.

    Questions are numbered in file order. Topics are numbered in the order
    returned by value_counts(), i.e. by descending frequency. Two pickle files
    are written, each holding a forward and a reverse pd.Series.
    """
    print('Changing the quetion and topic to id and save in sr_question2.pkl and sr_topic2id.pkl in ../data/')
    df_question_topic = pd.read_csv('../raw_data/question_topic_train_set.txt', sep='\t', names=['question', 'topics'],
                        dtype={'question': object, 'topics': object})
    # Every row holds a comma-separated topic string; turn it into a list.
    df_question_topic['topics'] = df_question_topic['topics'].apply(lambda tps: tps.split(','))
    save_path = '../data/'
    print('questino number = %d ' % len(df_question_topic))

    # Question ids follow the order in which the questions are given.
    questions = df_question_topic['question'].values
    n_question = len(questions)
    sr_question2id = pd.Series(range(n_question), index=questions)
    sr_id2question = pd.Series(questions, index=range(n_question))

    # Topic ids follow the frequency ranking produced by value_counts().
    all_topics = list(chain.from_iterable(df_question_topic['topics'].values))
    ranked_topics = pd.Series(all_topics).value_counts().index
    n_topic = len(ranked_topics)
    sr_topic2id = pd.Series(range(n_topic), index=ranked_topics)
    sr_id2topic = pd.Series(ranked_topics, index=range(n_topic))

    # Each file stores the forward mapping first, then the reverse mapping.
    with open(save_path + 'sr_question2id.pkl', 'wb') as outp:
        for series in (sr_question2id, sr_id2question):
            pickle.dump(series, outp)
    with open(save_path + 'sr_topic2id.pkl', 'wb') as outp:
        for series in (sr_topic2id, sr_id2topic):
            pickle.dump(series, outp)
    print('Finished changing.')


if __name__ == '__main__':
    # Script entry point: build and save the question/topic id mappings.
    question_and_topic_2id()


================================================
FILE: zhihu-text-classification-master/data_process/run_all_data_process.sh
================================================
#!/usr/bin/env bash
# Run every data-processing step in order.
#
# Each script reads files written by an earlier one (e.g. char2id.py loads
# sr_char2id.pkl produced by embed2ndarray.py), so stop at the first failure
# instead of running the remaining steps on missing or stale data.
set -e

# The Python scripts use paths relative to this directory (../data/,
# ../raw_data/), so run them from here wherever the script was launched.
cd "$(dirname "${BASH_SOURCE[0]}")"

for script in embed2ndarray.py question_and_topic_2id.py char2id.py word2id.py \
              creat_batch_data.py creat_batch_seg.py; do
    echo -e "\033[44;37;5m RUNNING ${script}\033[0m ";
    python "${script}";
done

================================================
FILE: zhihu-text-classification-master/data_process/test.py
================================================
# -*- coding:utf-8 -*-


from multiprocessing import Pool
import numpy as np

def func(a, b):
    """Return a + b."""
    return a+b


def add_pair(pair):
    """Unpack one (a, b) pair and forward it to func.

    Pool.map hands every work item to the callable as a single argument, so
    the two-argument func cannot be mapped over zipped pairs directly.
    """
    return func(*pair)


def main():
    """Add the zipped pairs of two small lists in a process pool and print the sums."""
    a = [1,2,3]
    b = [4,5,6]
    para = list(zip(a,b))
    p = Pool()
    try:
        result = p.map(add_pair, para)
    finally:
        # Release the worker processes even when the mapping fails.
        p.close()
        p.join()
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print(result)


# Guard the entry point so importing this module (which multiprocessing does
# in spawned workers) does not start another pool.
if __name__ == '__main__':
    main()

================================================
FILE: zhihu-text-classification-master/data_process/word2id.py
================================================
# -*- coding:utf-8 -*-

from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import pickle
from multiprocessing import Pool
from tqdm import tqdm
import time

# Load the two pickled Series stored back to back in sr_word2id.pkl:
# first the id -> word mapping, then the word -> id mapping.
save_path = '../data/'
with open(save_path + 'sr_word2id.pkl', 'rb') as inp:
    sr_id2word, sr_word2id = pickle.load(inp), pickle.load(inp)
# Plain dict (word -> id) used by get_id() for membership tests and lookups.
dict_word2id = dict(zip(sr_word2id.index, sr_word2id.values))


def get_id(word):
    """Return the id of `word`.

    A word that is not a key of dict_word2id is replaced by the id 1.
    """
    return dict_word2id.get(word, 1)


def get_id4words(words):
    """Convert a comma-separated string of words into the list of their ids."""
    # Strip surrounding whitespace, split on ',' and look every word up.
    return [get_id(token) for token in words.strip().split(',')]


def test_word2id():
    """Convert every word of the eval set to its id and save the result.

    Reads ../raw_data/question_eval_set.txt (columns 0, 2 and 4), fills a
    missing title with the content, then a missing content with the title,
    and writes ../data/wd_eval_title.npy and ../data/wd_eval_content.npy.
    Each saved array holds one list of ids per question.
    """
    time0 = time.time()
    print('Processing eval data.')
    df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 2, 4],
                          names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object})
    print('test question number %d' % len(df_eval))
    # Questions without a title borrow their content.
    na_title = df_eval.word_title.isnull()
    print('There are %d test questions without title.' % na_title.sum())
    df_eval['word_title'] = df_eval['word_title'].fillna(df_eval['word_content'])
    # Questions without a content borrow their (possibly just filled) title.
    na_content = df_eval.word_content.isnull()
    print('There are %d test questions without content.' % na_content.sum())
    df_eval['word_content'] = df_eval['word_content'].fillna(df_eval['word_title'])
    # Convert to ids in parallel.
    p = Pool()
    try:
        # dtype=object: the id lists have different lengths, and NumPy >= 1.24
        # refuses to build such a ragged array implicitly.
        eval_title = np.asarray(p.map(get_id4words, df_eval.word_title.values), dtype=object)
        np.save('../data/wd_eval_title.npy', eval_title)
        eval_content = np.asarray(p.map(get_id4words, df_eval.word_content.values), dtype=object)
        np.save('../data/wd_eval_content.npy', eval_content)
    finally:
        # Release the worker processes even when a conversion fails.
        p.close()
        p.join()
    print('Finished changing the eval words to ids. Costed time %g s' % (time.time() - time0))


def train_word2id():
    """Convert every word of the training set to its id and save the result.

    Reads ../raw_data/question_train_set.txt (columns 0, 2 and 4), fills a
    missing content with the title, drops the samples without a title, and
    writes ../data/wd_train_title.npy and ../data/wd_train_content.npy.
    """
    time0 = time.time()
    print('Processing train data.')
    df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 2, 4],
                           names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object})
    print('training question number %d ' % len(df_train))
    # Questions without a content borrow their title.
    na_content = df_train.word_content.isnull()
    print('There are %d train questions without content.' % na_content.sum())
    df_train['word_content'] = df_train['word_content'].fillna(df_train['word_title'])
    # Samples without a title are dropped. Row positions equal the index
    # labels here (default index from read_csv).
    na_title_indexs = np.flatnonzero(df_train.word_title.isnull().values).tolist()
    print('There are %d train questions without title.' % len(na_title_indexs))
    df_train = df_train.drop(na_title_indexs)
    print('After dropping, training question number(should be 2999952) = %d' % len(df_train))
    # Convert to ids in parallel.
    p = Pool()
    try:
        # dtype=object: the id lists have different lengths, and NumPy >= 1.24
        # refuses to build such a ragged array implicitly.
        train_title = np.asarray(p.map(get_id4words, df_train.word_title.values), dtype=object)
        np.save('../data/wd_train_title.npy', train_title)
        train_content = np.asarray(p.map(get_id4words, df_train.word_content.values), dtype=object)
        np.save('../data/wd_train_content.npy', train_content)
    finally:
        # Release the worker processes even when a conversion fails.
        p.close()
        p.join()
    print('Finished changing the training words to ids. Costed time %g s' % (time.time() - time0))


if __name__ == '__main__':
    # Convert the eval set first, then the training set.
    for convert in (test_word2id, train_word2id):
        convert()


================================================
FILE: zhihu-text-classification-master/models/wd_1_1_cnn_concat/__init__.py
================================================
# -*- coding:utf-8 -*- 



================================================
FILE: zhihu-text-classification-master/models/wd_1_1_cnn_concat/network.py
================================================
# -*- coding:utf-8 -*-

import tensorflow as tf

"""wd_1_1_cnn_concat
title 部分使用 TextCNN；content 部分使用 TextCNN； 两部分输出直接 concat。
"""


class Settings(object):
    """Hyper-parameters and output locations of the wd_1_1_cnn_concat model."""

    def __init__(self):
        name = 'wd_1_1_cnn_concat'
        self.model_name = name
        # Lengths of the title and content id sequences fed to the network.
        self.title_len, self.content_len = 30, 150
        # Convolution filter heights and the number of filters per height.
        self.filter_sizes = [2, 3, 4, 5, 7]
        self.n_filter = 256
        # Width of the fully connected layer and number of output classes.
        self.fc_hidden_size, self.n_class = 1024, 1999
        # Per-model output directories.
        self.summary_path = '../../summary/' + name + '/'
        self.ckpt_path = '../../ckpt/' + name + '/'


class TextCNN(object):
    """TextCNN classifier over a title sequence and a content sequence.

    Graph built in __init__:
      title:   ids -> embedding -> cnn_inference -> output_title
      content: ids -> embedding -> cnn_inference -> output_content
      concat[output_title, output_content] -> fc + batch norm + relu -> dropout
        -> linear output layer (y_pred, raw scores)
        -> mean sigmoid cross-entropy loss.
    """

    def __init__(self, W_embedding, settings):
        """Build the whole graph.

        Args:
            W_embedding: initial embedding matrix; its .shape is used as the
                variable shape and .shape[1] as the embedding size.
            settings: object providing model_name, title_len, content_len,
                filter_sizes, n_filter, n_class and fc_hidden_size.
        """
        self.model_name = settings.model_name
        self.title_len = settings.title_len
        self.content_len = settings.content_len
        self.filter_sizes = settings.filter_sizes
        self.n_filter = settings.n_filter
        # Width of the concatenated pooled features of one cnn_inference call.
        self.n_filter_total = self.n_filter * len(self.filter_sizes)
        self.n_class = settings.n_class
        self.fc_hidden_size = settings.fc_hidden_size
        self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
        # Collects the moving-average update ops returned by every batchnorm call.
        self.update_emas = list()
        # placeholders
        # _tst selects moving averages (True) or batch statistics (False) in batchnorm.
        self._tst = tf.placeholder(tf.bool)
        # _keep_prob is the keep probability of the dropout after the fc layer.
        self._keep_prob = tf.placeholder(tf.float32, [])
        # _batch_size is exposed via a property but not used by any op built in this class.
        self._batch_size = tf.placeholder(tf.int32, [])

        with tf.name_scope('Inputs'):
            self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
            self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs')
            self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')

        with tf.variable_scope('embedding'):
            # Trainable embedding initialised from W_embedding.
            self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
                                             initializer=tf.constant_initializer(W_embedding), trainable=True)
        self.embedding_size = W_embedding.shape[1]

        with tf.variable_scope('cnn_text'):
            output_title = self.cnn_inference(self._X1_inputs, self.title_len)

        with tf.variable_scope('hcnn_content'):
            output_content = self.cnn_inference(self._X2_inputs, self.content_len)

        with tf.variable_scope('fc-bn-layer'):
            output = tf.concat([output_title, output_content], axis=1)
            # Input width is doubled because title and content features are concatenated.
            W_fc = self.weight_variable([self.n_filter_total * 2, self.fc_hidden_size], name='Weight_fc')
            tf.summary.histogram('W_fc', W_fc)
            h_fc = tf.matmul(output, W_fc, name='h_fc')
            # No separate bias: beta_fc is passed to batchnorm as the offset.
            beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc"))
            tf.summary.histogram('beta_fc', beta_fc)
            fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
            self.update_emas.append(update_ema_fc)
            self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
            fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)

        with tf.variable_scope('out_layer'):
            W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
            tf.summary.histogram('Weight_out', W_out)
            b_out = self.bias_variable([self.n_class], name='bias_out')
            tf.summary.histogram('bias_out', b_out)
            self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # scores for every class

        with tf.name_scope('loss'):
            # y_pred is fed as logits; the sigmoid is applied inside the loss op.
            self._loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
            tf.summary.scalar('loss', self._loss)

        self.saver = tf.train.Saver(max_to_keep=2)

    # Read-only accessors for the placeholders and output tensors.
    @property
    def tst(self):
        return self._tst

    @property
    def keep_prob(self):
        return self._keep_prob

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def global_step(self):
        return self._global_step

    @property
    def X1_inputs(self):
        return self._X1_inputs

    @property
    def X2_inputs(self):
        return self._X2_inputs

    @property
    def y_inputs(self):
        return self._y_inputs

    @property
    def y_pred(self):
        return self._y_pred

    @property
    def loss(self):
        return self._loss

    def weight_variable(self, shape, name):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial, name=name)

    def bias_variable(self, shape, name):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial, name=name)

    def batchnorm(self, Ylogits, offset, convolutional=False):
        """Batch normalization without a scale term.

        Args:
            Ylogits: tensor to normalize. Moments are taken over axes [0, 1, 2]
                when `convolutional` is True and over axis [0] otherwise.
            offset: beta, passed as the offset of tf.nn.batch_normalization
                (the scale argument is None).
            convolutional: selects the axes used for the moments, see above.
        Returns:
            Ybn: the batch-normalized tensor.
            update_moving_everages: op that updates the exponential moving
                averages of mean and variance; those averages are the
                statistics used when the `tst` placeholder is True.
        """
        exp_moving_avg = tf.train.ExponentialMovingAverage(0.999,
                                                           self._global_step)  # adding the iteration prevents from averaging across non-existing iterations
        # Small constant that keeps the division by the variance away from zero.
        bnepsilon = 1e-5
        if convolutional:
            mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
        else:
            mean, variance = tf.nn.moments(Ylogits, [0])
        update_moving_everages = exp_moving_avg.apply([mean, variance])
        # tst=True: use the moving averages; tst=False: use the current batch statistics.
        m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
        v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
        Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
        return Ybn, update_moving_everages

    def cnn_inference(self, X_inputs, n_step):
        """TextCNN feature extractor.

        Args:
            X_inputs: id tensor, tensor.shape=(batch_size, n_step)
            n_step: sequence length; sets the max-pool window so that pooling
                spans the whole convolution output.
        Returns:
            tensor.shape=(batch_size, self.n_filter_total)
        """
        inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
        # Add a trailing channel axis for conv2d.
        inputs = tf.expand_dims(inputs, -1)
        pooled_outputs = list()
        for i, filter_size in enumerate(self.filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, self.embedding_size, 1, self.n_filter]
                W_filter = self.weight_variable(shape=filter_shape, name='W_filter')
                beta = self.bias_variable(shape=[self.n_filter], name='beta_filter')
                tf.summary.histogram('beta', beta)
                conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True)  # batch norm goes before the activation
                # Apply nonlinearity, batch norm scaling is not useful with relus
                # batch norm offsets are used instead of biases
                h = tf.nn.relu(conv_bn, name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1],
                                        strides=[1, 1, 1, 1], padding='VALID', name="pool")
                pooled_outputs.append(pooled)
                self.update_emas.append(update_ema)
        # Join the pooled features of all filter sizes along the channel axis.
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total])
        return h_pool_flat  # shape = [batch_size, self.n_filter_total]


# test the model
# def test():
#     import numpy as np
#     print('Begin testing...')
#     settings = Settings()
#     W_embedding = np.random.randn(50, 10)
#     config = tf.ConfigProto()
#     config.gpu_options.allow_growth = True
#     batch_size = 128
#     with tf.Session(config=config) as sess:
#         model = TextCNN(W_embedding, settings)
#         optimizer = tf.train.AdamOptimizer(0.001)
#         train_op = optimizer.minimize(model.loss)
#         update_op = tf.group(*model.update_emas)
#         sess.run(tf.global_variables_initializer())
#         fetch = [model.loss, model.y_pred, train_op, update_op]
#         loss_list = list()
#         for i in xrange(100):
#             X1_batch = np.zeros((batch_size, 30), dtype=float)
#             X2_batch = np.zeros((batch_size, 150), dtype=float)
#             y_batch = np.zeros((batch_size, 1999), dtype=int)
#             _batch_size = len(y_batch)
#             feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
#                          model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
#             loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
#             loss_list.append(loss)
#             print(i, loss)
#
# if __name__ == '__main__':
#     test()


================================================
FILE: zhihu-text-classification-master/models/wd_1_1_cnn_concat/predict.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import time
import network

sys.path.append('../..')
from evaluator import score_eval

# Model hyper-parameters and paths come from the Settings object of this model's network module.
settings = network.Settings()
title_len = settings.title_len  # number of leading columns of X that hold the title
model_name = settings.model_name  # used as the file name of the saved score matrices
ckpt_path = settings.ckpt_path

# Output directories for the validation-set and test-set score matrices;
# they are created at import time when missing.
local_scores_path = '../../local_scores/'
scores_path = '../../scores/'
if not os.path.exists(local_scores_path):
    os.makedirs(local_scores_path)
if not os.path.exists(scores_path):
    os.makedirs(scores_path)

# Input locations. The number of batches is taken from the number of directory entries,
# and batches are later loaded by index (0.npz / 0.npy, 1.npz / 1.npy, ...).
embedding_path = '../../data/word_embedding.npy'
data_valid_path = '../../data/wd-data/data_valid/'
data_test_path = '../../data/wd-data/data_test/'
va_batches = os.listdir(data_valid_path)
te_batches = os.listdir(data_test_path)  # list of batch file names
n_va_batches = len(va_batches)
n_te_batches = len(te_batches)


def get_batch(batch_id):
    """Load one validation batch and split its features into title and content.

    Args:
        batch_id: index of the batch; the file ``<data_valid_path><batch_id>.npz`` is read.

    Returns:
        [X1_batch, X2_batch, y_batch]: the first ``title_len`` columns of the stored
        ``X`` array, the remaining columns of ``X``, and the stored ``y`` array.
    """
    # np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
    # open; the context manager closes it as soon as both arrays have been read.
    # NOTE(review): if 'y' was saved as an object array, recent NumPy versions also
    # need allow_pickle=True here -- confirm against the data-generation scripts.
    with np.load(data_valid_path + str(batch_id) + '.npz') as new_batch:
        X_batch = new_batch['X']
        y_batch = new_batch['y']
    X1_batch = X_batch[:, :title_len]
    X2_batch = X_batch[:, title_len:]
    return [X1_batch, X2_batch, y_batch]


def get_test_batch(batch_id):
    """Read test batch ``batch_id`` and split its columns at ``title_len``.

    Args:
        batch_id: index of the batch; the file ``<data_test_path><batch_id>.npy`` is read.

    Returns:
        [X1_batch, X2_batch]: the columns before ``title_len`` and the columns from
        ``title_len`` onwards.
    """
    batch_file = data_test_path + str(batch_id) + '.npy'
    features = np.load(batch_file)
    title_part, content_part = features[:, :title_len], features[:, title_len:]
    return [title_part, content_part]


def local_predict(sess, model):
    """Score the validation set, print precision/recall/F1 and save the raw scores.

    Every validation batch is run through ``model.y_pred`` in test mode
    (``tst=True``, ``keep_prob=1.0``). The five highest-scoring label indices of each
    sample are compared with the stored labels through ``score_eval``; the raw score
    matrices of all batches are stacked row-wise and saved to
    ``<local_scores_path><model_name>.npy``.

    Args:
        sess: session holding the restored model variables.
        model: network object exposing the placeholders and ``y_pred``.
    """
    time0 = time.time()
    predict_labels_list = list()  # top-5 predicted label indices, one entry per sample
    marked_labels_list = list()  # stored labels, one entry per sample
    predict_scores = list()  # raw score matrix of every batch
    # `range` instead of the Python-2-only `xrange`, matching train.py.
    for i in tqdm(range(n_va_batches)):
        [X1_batch, X2_batch, y_batch] = get_batch(i)
        marked_labels_list.extend(y_batch)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
        # indices of the 5 largest scores, best first
        predict_labels_list.extend([label.argsort()[-1:-6:-1] for label in predict_labels])
    # Materialised as a list, which is what `zip` returned under Python 2.
    predict_label_and_marked_label_list = list(zip(predict_labels_list, marked_labels_list))
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
    # Stack the per-batch matrices directly: wrapping them in np.asarray first breaks
    # when batches hold different numbers of rows (e.g. a shorter final batch).
    predict_scores = np.vstack(predict_scores)
    local_scores_name = local_scores_path + model_name + '.npy'
    np.save(local_scores_name, predict_scores)
    print('local_scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0))


def predict(sess, model):
    """Score the test set and save the raw scores.

    Every test batch is run through ``model.y_pred`` in test mode (``tst=True``,
    ``keep_prob=1.0``); the per-batch score matrices are stacked row-wise and saved
    to ``<scores_path><model_name>.npy``.

    Args:
        sess: session holding the restored model variables.
        model: network object exposing the placeholders and ``y_pred``.
    """
    time0 = time.time()
    predict_scores = list()  # raw score matrix of every batch
    # `range` instead of the Python-2-only `xrange`, matching train.py.
    for i in tqdm(range(n_te_batches)):
        [X1_batch, X2_batch] = get_test_batch(i)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
    # Stack the per-batch matrices directly: wrapping them in np.asarray first breaks
    # when batches hold different numbers of rows (e.g. a shorter final batch).
    predict_scores = np.vstack(predict_scores)
    scores_name = scores_path + model_name + '.npy'
    np.save(scores_name, predict_scores)
    print('scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0))


def main(_):
    """Restore the latest checkpoint and predict on the validation and test sets.

    Exits with status 1 when ``ckpt_path`` contains no ``checkpoint`` file.

    Args:
        _: argument list passed in by ``tf.app.run``; unused.
    """
    if not os.path.exists(ckpt_path + 'checkpoint'):
        print('there is not saved model, please check the ckpt path')
        # sys.exit rather than the interactive-only `exit()` helper, and a non-zero
        # status so that calling scripts can detect the failure.
        sys.exit(1)
    print('Loading model...')
    W_embedding = np.load(embedding_path)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.TextCNN(W_embedding, settings)
        model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
        print('Local predicting...')
        local_predict(sess, model)
        print('Test predicting...')
        predict(sess, model)


if __name__ == '__main__':
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_1_1_cnn_concat/train.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import shutil
import time
import network

sys.path.append('../..')
from data_helpers import to_categorical
from evaluator import score_eval

# Command-line flags of the training script.
flags = tf.flags
flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary')
flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6')
flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3')
flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65')
flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
# Settings for a real training run
flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
flags.DEFINE_float('last_f1', 0.40, 'if valid_f1 > last_f1, save new model. default: 0.40')

# Settings for a quick test run (swap with the three definitions above)
# flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
# flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
# flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
FLAGS = flags.FLAGS

lr = FLAGS.lr
last_f1 = FLAGS.last_f1  # best validation F1 so far; updated by train_epoch and main
settings = network.Settings()
title_len = settings.title_len  # number of leading columns of X that hold the title
summary_path = settings.summary_path
ckpt_path = settings.ckpt_path
model_path = ckpt_path + 'model.ckpt'  # checkpoint file prefix handed to the saver

# Input locations. The number of batches is taken from the number of directory entries,
# and batches are later loaded by index (0.npz, 1.npz, ...).
embedding_path = '../../data/word_embedding.npy'
data_train_path = '../../data/wd-data/data_train/'
data_valid_path = '../../data/wd-data/data_valid/'
tr_batches = os.listdir(data_train_path)  # list of batch file names
va_batches = os.listdir(data_valid_path)
n_tr_batches = len(tr_batches)
n_va_batches = len(va_batches)

# 测试
# n_tr_batches = 1000
# n_va_batches = 50


def get_batch(data_path, batch_id):
    """Load one labelled batch and split its features into title and content.

    Args:
        data_path: directory prefix (with trailing separator) holding the batch files.
        batch_id: index of the batch; the file ``<data_path><batch_id>.npz`` is read.

    Returns:
        [X1_batch, X2_batch, y_batch]: the first ``title_len`` columns of the stored
        ``X`` array, the remaining columns of ``X``, and the stored ``y`` array.
    """
    # np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
    # open; the context manager closes it as soon as both arrays have been read, so
    # a long training run does not accumulate open handles.
    with np.load(data_path + str(batch_id) + '.npz') as new_batch:
        X_batch = new_batch['X']
        y_batch = new_batch['y']
    X1_batch = X_batch[:, :title_len]
    X2_batch = X_batch[:, title_len:]
    return [X1_batch, X2_batch, y_batch]


def valid_epoch(data_path, sess, model):
    """Evaluate ``model`` on every batch stored under ``data_path``.

    Args:
        data_path: directory prefix of the batches; its entry count is the batch count.
        sess: session holding the model variables.
        model: network object exposing the placeholders, ``loss`` and ``y_pred``.

    Returns:
        (mean_cost, precision, recall, f1): the loss averaged over batches and the
        three values returned by ``score_eval`` for the top-5 predictions.
    """
    batch_total = len(os.listdir(data_path))
    loss_sum = 0.0
    top5_per_sample = []  # predicted label indices of every sample
    truth_per_sample = []  # stored labels of every sample
    for batch_no in range(batch_total):
        title_ids, content_ids, raw_labels = get_batch(data_path, batch_no)
        truth_per_sample.extend(raw_labels)
        targets = to_categorical(raw_labels)
        feed = {model.X1_inputs: title_ids, model.X2_inputs: content_ids, model.y_inputs: targets,
                model.batch_size: len(targets), model.tst: True, model.keep_prob: 1.0}
        batch_loss, scores = sess.run([model.loss, model.y_pred], feed)
        loss_sum += batch_loss
        # indices of the 5 largest scores of each row, best first
        top5_per_sample.extend([row.argsort()[-1:-6:-1] for row in scores])
    precision, recall, f1 = score_eval(zip(top5_per_sample, truth_per_sample))
    return loss_sum / batch_total, precision, recall, f1


def train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
    """Run one pass over the training batches in random order.

    Before a training step whose 1-based number is a multiple of ``FLAGS.valid_step``
    the whole validation set is evaluated and the model is saved when F1 beats the
    module-level ``last_f1``. On every step whose 1-based number is a multiple of 500
    the training summary and the summary of one random validation batch are written.

    Args:
        data_train_path: directory prefix of the training batches.
        sess: session holding the model variables.
        model: network object exposing the placeholders, ``global_step`` and ``saver``.
        train_fetches: four fetches run for each training step; the first is the summary.
        valid_fetches: two fetches run for the sampled validation batch; the first is the summary.
        train_writer: summary writer receiving the training summaries.
        test_writer: summary writer receiving the validation summaries.
    """
    global last_f1

    def build_feed(batch, is_test, keep_prob):
        """Turn a [X1, X2, y] batch into the feed dict of the model."""
        titles, contents, labels = batch
        labels = to_categorical(labels)
        return {model.X1_inputs: titles, model.X2_inputs: contents, model.y_inputs: labels,
                model.batch_size: len(labels), model.tst: is_test, model.keep_prob: keep_prob}

    tick = time.time()
    shuffled_ids = np.random.permutation(n_tr_batches)  # visiting order of the training batches
    for position in tqdm(range(n_tr_batches)):
        step = sess.run(model.global_step)
        next_step = step + 1
        if next_step % FLAGS.valid_step == 0:
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
                step, valid_cost, precision, recall, f1, time.time() - tick))
            tick = time.time()
            if f1 > last_f1:
                last_f1 = f1
                saving_path = model.saver.save(sess, model_path, next_step)
                print('saved new model to %s ' % saving_path)
        # one optimisation step on the next shuffled batch
        train_feed = build_feed(get_batch(data_train_path, shuffled_ids[position]), False, FLAGS.keep_prob)
        summary, _cost, _, _ = sess.run(train_fetches, train_feed)  # mean cost of this batch
        if next_step % 500 != 0:
            continue
        # every 500 steps: log the training summary plus one random validation batch
        train_writer.add_summary(summary, step)
        valid_id = np.random.randint(0, n_va_batches)
        valid_feed = build_feed(get_batch(data_valid_path, valid_id), True, 1.0)
        summary, _cost = sess.run(valid_fetches, valid_feed)
        test_writer.add_summary(summary, step)


def main(_):
    """Build the TextCNN graph, then train and validate it for FLAGS.max_max_epoch epochs.

    Creates the checkpoint and summary directories (clearing old summaries unless
    ``--is_retrain`` is set), restores the latest checkpoint when one exists, and
    trains with the embedding frozen for the first ``FLAGS.max_epoch`` epochs and
    trainable from then on. A final validation pass saves the model once more when
    its F1 beats the best value seen so far.

    Args:
        _: argument list passed in by ``tf.app.run``; unused.
    """
    global last_f1
    if not os.path.exists(ckpt_path):
        os.makedirs(ckpt_path)
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    elif not FLAGS.is_retrain:  # fresh run: discard the summaries of previous runs
        shutil.rmtree(summary_path)
        os.makedirs(summary_path)

    print('1.Loading data...')
    W_embedding = np.load(embedding_path)
    print('training sample_num = %d' % n_tr_batches)
    print('valid sample_num = %d' % n_va_batches)

    # Initial or restore the model
    print('2.Building model...')
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.TextCNN(W_embedding, settings)
        with tf.variable_scope('training_ops') as vs:
            learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
                                                       FLAGS.decay_rate, staircase=True)
            # two optimizer: op1, update embedding; op2, do not update embedding.
            with tf.variable_scope('Optimizer1'):
                tvars1 = tf.trainable_variables()
                train_op1 = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
                    model.loss, global_step=model.global_step, var_list=tvars1)

            with tf.variable_scope('Optimizer2'):
                tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
                train_op2 = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
                    model.loss, global_step=model.global_step, var_list=tvars2)

            update_op = tf.group(*model.update_emas)
            merged = tf.summary.merge_all()  # summary
            train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
            test_writer = tf.summary.FileWriter(summary_path + 'test')
            # variables created by the optimizers; they are not part of model.saver
            training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]

        # If a model was saved before, restore it and continue from there.
        if os.path.exists(ckpt_path + "checkpoint"):
            print("Restoring Variables from Checkpoint...")
            model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
            last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
            print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
            sess.run(tf.variables_initializer(training_ops))
        else:
            print('Initializing Variables...')
            sess.run(tf.global_variables_initializer())

        print('3.Begin training...')
        print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
        for epoch in range(FLAGS.max_max_epoch):
            global_step = sess.run(model.global_step)
            print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
            # The embedding stays frozen for the first max_epoch epochs and is updated
            # in every later one. `==` here would unfreeze it for a single epoch only,
            # contradicting the max_epoch flag ("update the embedding after max_epoch").
            if epoch >= FLAGS.max_epoch:
                train_op = train_op1
            else:
                train_op = train_op2

            train_fetches = [merged, model.loss, train_op, update_op]
            valid_fetches = [merged, model.loss]
            train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
        # Validate once more after the last epoch.
        valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
        print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
            sess.run(model.global_step), valid_cost, precision, recall, f1))
        if f1 > last_f1:  # save the better model
            saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
            print('saved new model to %s ' % saving_path)
        # Flush and release the event files so the last summaries are not lost.
        train_writer.close()
        test_writer.close()


if __name__ == '__main__':
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_1_2_cnn_max/__init__.py
================================================
# -*- coding:utf-8 -*- 



================================================
FILE: zhihu-text-classification-master/models/wd_1_2_cnn_max/network.py
================================================
# -*- coding:utf-8 -*-

import tensorflow as tf

"""wd_1_2_cnn_max
The title is encoded with a TextCNN and the content is encoded with a TextCNN; the two outputs are merged with an element-wise max.
"""


class Settings(object):
    """Hyper-parameters and output locations of the wd_1_2_cnn_max model."""

    def __init__(self):
        name = 'wd_1_2_cnn_max'
        self.model_name = name
        # sequence lengths of the two inputs
        self.title_len, self.content_len = 30, 150
        # convolution branch: one set of n_filter filters per filter size
        self.filter_sizes = [2, 3, 4, 5, 7]
        self.n_filter = 256
        # classifier head
        self.fc_hidden_size = 1024
        self.n_class = 1999
        # summaries and checkpoints live in per-model sub-directories
        self.summary_path = '../../summary/' + name + '/'
        self.ckpt_path = '../../ckpt/' + name + '/'

class TextCNN(object):
    """Two-branch TextCNN classifier whose branches are merged with an element-wise max.

    title: inputs->textcnn->output_title
    content: inputs->textcnn->output_content
    max[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.

    Both branches share one embedding matrix. The ops that update the batch-norm
    moving averages are collected in ``update_emas`` and have to be run by the
    training loop together with the train op.
    """

    def __init__(self, W_embedding, settings):
        """Build the whole graph.

        Args:
            W_embedding: 2-D array used to initialise the (trainable) embedding
                variable; its second dimension is taken as the embedding size.
            settings: object providing model_name, title_len, content_len,
                filter_sizes, n_filter, n_class and fc_hidden_size.
        """
        self.model_name = settings.model_name
        self.title_len = settings.title_len
        self.content_len = settings.content_len
        self.filter_sizes = settings.filter_sizes
        self.n_filter = settings.n_filter
        # width of the concatenated max-pooled features of one branch
        self.n_filter_total = self.n_filter * len(self.filter_sizes)
        self.n_class = settings.n_class
        self.fc_hidden_size = settings.fc_hidden_size
        self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
        # ops updating the batch-norm moving averages, one per batchnorm() call
        self.update_emas = list()
        # placeholders
        self._tst = tf.placeholder(tf.bool)  # True: normalise with the moving averages
        self._keep_prob = tf.placeholder(tf.float32, [])  # dropout keep probability
        self._batch_size = tf.placeholder(tf.int32, [])

        with tf.name_scope('Inputs'):
            self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
            self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs')
            self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')

        with tf.variable_scope('embedding'):
            self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
                                             initializer=tf.constant_initializer(W_embedding), trainable=True)
        self.embedding_size = W_embedding.shape[1]

        # Each branch output gets a leading axis so the two can be concatenated and
        # reduced with max along that axis below.
        with tf.variable_scope('cnn_text'):
            output_title = self.cnn_inference(self._X1_inputs, self.title_len)
            output_title = tf.expand_dims(output_title, 0)

        with tf.variable_scope('hcnn_content'):
            output_content = self.cnn_inference(self._X2_inputs, self.content_len)
            output_content = tf.expand_dims(output_content, 0)

        with tf.variable_scope('fc-bn-layer'):
            # element-wise max of the title and content features
            output = tf.concat([output_title, output_content], axis=0)
            output = tf.reduce_max(output, axis=0)
            W_fc = self.weight_variable([self.n_filter_total, self.fc_hidden_size], name='Weight_fc')
            tf.summary.histogram('W_fc', W_fc)
            h_fc = tf.matmul(output, W_fc, name='h_fc')
            # no bias on the fc layer: the batch-norm offset beta_fc takes its place
            beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc"))
            tf.summary.histogram('beta_fc', beta_fc)
            fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
            self.update_emas.append(update_ema_fc)
            self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
            fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)

        with tf.variable_scope('out_layer'):
            W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
            tf.summary.histogram('Weight_out', W_out)
            b_out = self.bias_variable([self.n_class], name='bias_out')
            tf.summary.histogram('bias_out', b_out)
            self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # score (logit) of every class

        with tf.name_scope('loss'):
            # mean sigmoid cross-entropy over all samples and classes
            self._loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
            tf.summary.scalar('loss', self._loss)

        # Created here, so it covers the variables that exist at this point.
        self.saver = tf.train.Saver(max_to_keep=2)

    # Read-only accessors for the placeholders and output tensors.
    @property
    def tst(self):
        return self._tst

    @property
    def keep_prob(self):
        return self._keep_prob

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def global_step(self):
        return self._global_step

    @property
    def X1_inputs(self):
        return self._X1_inputs

    @property
    def X2_inputs(self):
        return self._X2_inputs

    @property
    def y_inputs(self):
        return self._y_inputs

    @property
    def y_pred(self):
        return self._y_pred

    @property
    def loss(self):
        return self._loss

    def weight_variable(self, shape, name):
        """Create a weight variable initialised from a truncated normal (stddev 0.1)."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial, name=name)

    def bias_variable(self, shape, name):
        """Create a bias variable initialised to the constant 0.1."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial, name=name)

    def batchnorm(self, Ylogits, offset, convolutional=False):
        """Batch normalization that switches to moving averages in test mode.

        Args:
            Ylogits: tensor to normalise; the batch moments are taken over axes
                [0, 1, 2] when ``convolutional`` is True and over axis [0] otherwise.
            offset: the beta (shift) term handed to tf.nn.batch_normalization; the
                callers in this class initialise it to 0.1 ahead of a ReLU.
            convolutional: selects the axes the batch moments are computed over.
        Returns:
            Ybn: the batch-normalised tensor, with the same dimensions as Ylogits.
            update_moving_everages: op that updates the moving averages of the mean
                and variance; those averages are what test mode normalises with.

        No scale (gamma) is applied: None is passed for it. ``m`` and ``v`` are the
        mean and variance actually used, and ``bnepsilon`` is a small constant that
        guards against division by zero.
        """
        exp_moving_avg = tf.train.ExponentialMovingAverage(0.999,
                                                           self._global_step)  # adding the iteration prevents from averaging across non-existing iterations
        bnepsilon = 1e-5
        if convolutional:
            mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
        else:
            mean, variance = tf.nn.moments(Ylogits, [0])
        update_moving_everages = exp_moving_avg.apply([mean, variance])
        # test mode (self.tst is True): moving averages; training mode: batch statistics
        m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
        v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
        Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
        return Ybn, update_moving_everages

    def cnn_inference(self, X_inputs, n_step):
        """TextCNN branch: embedding -> parallel conv+BN+ReLU+max-pool -> concatenation.

        Args:
            X_inputs: tensor.shape=(batch_size, n_step)
            n_step: sequence length of X_inputs; sizes the max-pool window so that
                each filter is pooled over the whole sequence.
        Returns:
            title_outputs: tensor.shape=(batch_size, self.n_filter_total)
        """
        inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
        inputs = tf.expand_dims(inputs, -1)  # add a channel axis for conv2d
        pooled_outputs = list()
        for i, filter_size in enumerate(self.filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, self.embedding_size, 1, self.n_filter]
                W_filter = self.weight_variable(shape=filter_shape, name='W_filter')
                beta = self.bias_variable(shape=[self.n_filter], name='beta_filter')
                tf.summary.histogram('beta', beta)
                conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True)  # BN is applied before the activation
                # Apply nonlinearity, batch norm scaling is not useful with relus
                # batch norm offsets are used instead of biases: the BN offset replaces the conv bias
                h = tf.nn.relu(conv_bn, name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1],
                                        strides=[1, 1, 1, 1], padding='VALID', name="pool")
                pooled_outputs.append(pooled)
                self.update_emas.append(update_ema)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total])
        return h_pool_flat  # shape = [batch_size, self.n_filter_total]


# test the model
# def test():
#     import numpy as np
#     print('Begin testing...')
#     settings = Settings()
#     W_embedding = np.random.randn(50, 10)
#     config = tf.ConfigProto()
#     config.gpu_options.allow_growth = True
#     batch_size = 128
#     with tf.Session(config=config) as sess:
#         model = TextCNN(W_embedding, settings)
#         optimizer = tf.train.AdamOptimizer(0.001)
#         train_op = optimizer.minimize(model.loss)
#         update_op = tf.group(*model.update_emas)
#         sess.run(tf.global_variables_initializer())
#         fetch = [model.loss, model.y_pred, train_op, update_op]
#         loss_list = list()
#         for i in xrange(100):
#             X1_batch = np.zeros((batch_size, 30), dtype=float)
#             X2_batch = np.zeros((batch_size, 150), dtype=float)
#             y_batch = np.zeros((batch_size, 1999), dtype=int)
#             _batch_size = len(y_batch)
#             feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
#                          model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
#             loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
#             loss_list.append(loss)
#             print(i, loss)
#
# if __name__ == '__main__':
#     test()


================================================
FILE: zhihu-text-classification-master/models/wd_1_2_cnn_max/predict.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import time
import network

sys.path.append('../..')
from evaluator import score_eval

# Model hyper-parameters and paths come from the Settings object of this model's network module.
settings = network.Settings()
title_len = settings.title_len  # number of leading columns of X that hold the title
model_name = settings.model_name  # used as the file name of the saved score matrices
ckpt_path = settings.ckpt_path

# Output directories for the validation-set and test-set score matrices;
# they are created at import time when missing.
local_scores_path = '../../local_scores/'
scores_path = '../../scores/'
if not os.path.exists(local_scores_path):
    os.makedirs(local_scores_path)
if not os.path.exists(scores_path):
    os.makedirs(scores_path)

# Input locations. The number of batches is taken from the number of directory entries,
# and batches are later loaded by index (0.npz / 0.npy, 1.npz / 1.npy, ...).
embedding_path = '../../data/word_embedding.npy'
data_valid_path = '../../data/wd-data/data_valid/'
data_test_path = '../../data/wd-data/data_test/'
va_batches = os.listdir(data_valid_path)
te_batches = os.listdir(data_test_path)  # list of batch file names
n_va_batches = len(va_batches)
n_te_batches = len(te_batches)


def get_batch(batch_id):
    """Load one validation batch and split its features into title and content.

    Args:
        batch_id: index of the batch; the file ``<data_valid_path><batch_id>.npz`` is read.

    Returns:
        [X1_batch, X2_batch, y_batch]: the first ``title_len`` columns of the stored
        ``X`` array, the remaining columns of ``X``, and the stored ``y`` array.
    """
    # np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
    # open; the context manager closes it as soon as both arrays have been read.
    with np.load(data_valid_path + str(batch_id) + '.npz') as new_batch:
        X_batch = new_batch['X']
        y_batch = new_batch['y']
    X1_batch = X_batch[:, :title_len]
    X2_batch = X_batch[:, title_len:]
    return [X1_batch, X2_batch, y_batch]


def get_test_batch(batch_id):
    """Read test batch ``batch_id`` and cut its feature matrix at ``title_len``.

    Args:
        batch_id: index of the batch; the file ``<data_test_path><batch_id>.npy`` is read.

    Returns:
        [X1_batch, X2_batch]: the columns before ``title_len`` and the columns from
        ``title_len`` onwards.
    """
    source_file = data_test_path + str(batch_id) + '.npy'
    matrix = np.load(source_file)
    head, tail = matrix[:, :title_len], matrix[:, title_len:]
    return [head, tail]


def local_predict(sess, model):
    """Score the validation set, print precision/recall/F1 and save the raw scores.

    Every validation batch is run through ``model.y_pred`` in test mode
    (``tst=True``, ``keep_prob=1.0``). The five highest-scoring label indices of each
    sample are compared with the stored labels through ``score_eval``; the raw score
    matrices of all batches are stacked row-wise and saved to
    ``<local_scores_path><model_name>.npy``.

    Args:
        sess: session holding the restored model variables.
        model: network object exposing the placeholders and ``y_pred``.
    """
    time0 = time.time()
    predict_labels_list = list()  # top-5 predicted label indices, one entry per sample
    marked_labels_list = list()  # stored labels, one entry per sample
    predict_scores = list()  # raw score matrix of every batch
    # `range` instead of `xrange`, which does not exist on Python 3 (train.py uses range).
    for i in tqdm(range(n_va_batches)):
        [X1_batch, X2_batch, y_batch] = get_batch(i)
        marked_labels_list.extend(y_batch)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
        # indices of the 5 largest scores, best first
        predict_labels_list.extend([label.argsort()[-1:-6:-1] for label in predict_labels])
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
    # Stack the per-batch matrices directly: wrapping them in np.asarray first breaks
    # when batches hold different numbers of rows (e.g. a shorter final batch).
    predict_scores = np.vstack(predict_scores)
    local_scores_name = local_scores_path + model_name + '.npy'
    np.save(local_scores_name, predict_scores)
    print('local_scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0))


def predict(sess, model):
    """Score the test set and save the raw scores.

    Every test batch is run through ``model.y_pred`` in test mode (``tst=True``,
    ``keep_prob=1.0``); the per-batch score matrices are stacked row-wise and saved
    to ``<scores_path><model_name>.npy``.

    Args:
        sess: session holding the restored model variables.
        model: network object exposing the placeholders and ``y_pred``.
    """
    time0 = time.time()
    predict_scores = list()  # raw score matrix of every batch
    # `range` instead of `xrange`, which does not exist on Python 3 (train.py uses range).
    for i in tqdm(range(n_te_batches)):
        [X1_batch, X2_batch] = get_test_batch(i)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
    # Stack the per-batch matrices directly: wrapping them in np.asarray first breaks
    # when batches hold different numbers of rows (e.g. a shorter final batch).
    predict_scores = np.vstack(predict_scores)
    scores_name = scores_path + model_name + '.npy'
    np.save(scores_name, predict_scores)
    print('scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0))


def main(_):
    """Restore the latest checkpoint and predict on the validation and test sets.

    Exits with status 1 when ``ckpt_path`` contains no ``checkpoint`` file.

    Args:
        _: argument list passed in by ``tf.app.run``; unused.
    """
    if not os.path.exists(ckpt_path + 'checkpoint'):
        print('there is not saved model, please check the ckpt path')
        # sys.exit rather than the interactive-only `exit()` helper, and a non-zero
        # status so that calling scripts can detect the failure.
        sys.exit(1)
    print('Loading model...')
    W_embedding = np.load(embedding_path)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.TextCNN(W_embedding, settings)
        model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
        print('Local predicting...')
        local_predict(sess, model)
        print('Test predicting...')
        predict(sess, model)


if __name__ == '__main__':
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_1_2_cnn_max/train.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import shutil
import time
import network

sys.path.append('../..')
from data_helpers import to_categorical
from evaluator import score_eval

# Command-line flags of the training script.
flags = tf.flags
flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary')
flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6')
flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3')
flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65')
flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
# Settings for a real training run
flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
# The help text now states the real default (0.35); it previously claimed 0.40.
flags.DEFINE_float('last_f1', 0.35, 'if valid_f1 > last_f1, save new model. default: 0.35')

# Settings for a quick test run (swap with the three definitions above)
# flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
# flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
# flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
FLAGS = flags.FLAGS

lr = FLAGS.lr
last_f1 = FLAGS.last_f1  # best validation F1 so far; a new model is saved only above it
settings = network.Settings()
title_len = settings.title_len  # number of leading columns of X that hold the title
summary_path = settings.summary_path
ckpt_path = settings.ckpt_path
model_path = ckpt_path + 'model.ckpt'  # checkpoint file prefix handed to the saver

# Input locations. The number of batches is taken from the number of directory entries,
# and batches are later loaded by index (0.npz, 1.npz, ...).
embedding_path = '../../data/word_embedding.npy'
data_train_path = '../../data/wd-data/data_train/'
data_valid_path = '../../data/wd-data/data_valid/'
tr_batches = os.listdir(data_train_path)  # list of batch file names
va_batches = os.listdir(data_valid_path)
n_tr_batches = len(tr_batches)
n_va_batches = len(va_batches)

# 测试
# n_tr_batches = 1000
# n_va_batches = 50


def get_batch(data_path, batch_id):
    """Load one labelled batch and split its features into title and content.

    Args:
        data_path: directory prefix (with trailing separator) holding the batch files.
        batch_id: index of the batch; the file ``<data_path><batch_id>.npz`` is read.

    Returns:
        [X1_batch, X2_batch, y_batch]: the first ``title_len`` columns of the stored
        ``X`` array, the remaining columns of ``X``, and the stored ``y`` array.
    """
    # np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
    # open; the context manager closes it as soon as both arrays have been read, so
    # a long training run does not accumulate open handles.
    with np.load(data_path + str(batch_id) + '.npz') as new_batch:
        X_batch = new_batch['X']
        y_batch = new_batch['y']
    X1_batch = X_batch[:, :title_len]
    X2_batch = X_batch[:, title_len:]
    return [X1_batch, X2_batch, y_batch]


def valid_epoch(data_path, sess, model):
    """Run ``model`` over all batches under ``data_path`` and score the top-5 predictions.

    Args:
        data_path: directory prefix of the batches; its entry count is the batch count.
        sess: session holding the model variables.
        model: network object exposing the placeholders, ``loss`` and ``y_pred``.

    Returns:
        (mean_cost, precision, recall, f1): the loss averaged over batches and the
        three values returned by ``score_eval``.
    """
    n_batches = len(os.listdir(data_path))
    total_cost = 0.0
    predicted = []  # top-5 label indices of every sample
    marked = []  # stored labels of every sample
    for idx in range(n_batches):
        x_title, x_content, y_raw = get_batch(data_path, idx)
        marked.extend(y_raw)
        y_onehot = to_categorical(y_raw)
        feed = {model.X1_inputs: x_title, model.X2_inputs: x_content, model.y_inputs: y_onehot,
                model.batch_size: len(y_onehot), model.tst: True, model.keep_prob: 1.0}
        cost, scores = sess.run([model.loss, model.y_pred], feed)
        total_cost += cost
        # indices of the 5 largest scores of each row, best first
        predicted.extend([row.argsort()[-1:-6:-1] for row in scores])
    precision, recall, f1 = score_eval(zip(predicted, marked))
    return total_cost / n_batches, precision, recall, f1


def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
    """Run one pass over the training batches in random order.

    Before each step the current global step is read. Whenever
    ``global_step + 1`` is a multiple of ``FLAGS.valid_step`` the whole
    validation set is evaluated and the model is saved if its F1 beats the
    module-level ``last_f1``. Every 500 steps the training summary and the
    summary of one randomly chosen validation batch are written.

    Args:
        data_path: Directory prefix the training batches are loaded from.
        sess: Session used to run the graph.
        model: Network object exposing the placeholders, ``global_step`` and ``saver``.
        train_fetches: Fetch list of a training step; four results are
            unpacked and the first one is written as the summary.
        valid_fetches: Fetch list of the validation summary step; two results
            are unpacked.
        train_writer: Writer receiving the training summaries.
        test_writer: Writer receiving the validation summaries.
    """
    global last_f1
    time0 = time.time()
    # The number of steps per epoch comes from the module-level batch count.
    batch_indexs = np.random.permutation(n_tr_batches)  # shuffle the training data
    for batch in tqdm(range(n_tr_batches)):
        global_step = sess.run(model.global_step)
        if 0 == (global_step + 1) % FLAGS.valid_step:
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
                global_step, valid_cost, precision, recall, f1, time.time() - time0))
            time0 = time.time()
            if f1 > last_f1:
                last_f1 = f1
                saving_path = model.saver.save(sess, model_path, global_step+1)
                print('saved new model to %s ' % saving_path)
        # training: honour the data_path argument instead of the global train path
        batch_id = batch_indexs[batch]
        [X1_batch, X2_batch, y_batch] = get_batch(data_path, batch_id)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
        summary, _cost, _, _ = sess.run(train_fetches, feed_dict)  # the cost is the mean cost of one batch
        # write summaries every 500 steps
        if 0 == (global_step + 1) % 500:
            train_writer.add_summary(summary, global_step)
            batch_id = np.random.randint(0, n_va_batches)  # pick one validation batch at random
            [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
            y_batch = to_categorical(y_batch)
            _batch_size = len(y_batch)
            feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                         model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
            summary, _cost = sess.run(valid_fetches, feed_dict)
            test_writer.add_summary(summary, global_step)


def main(_):
    """Build the TextCNN graph, train it and checkpoint the best model.

    Steps: create the checkpoint and summary directories (an existing summary
    directory is wiped unless ``FLAGS.is_retrain`` is set), load the word
    embedding, build the model plus two Adam optimizers, restore the latest
    checkpoint when one exists, run ``FLAGS.max_max_epoch`` epochs and finish
    with one more validation pass.

    Args:
        _: Unused positional argument supplied by ``tf.app.run``.
    """
    global last_f1
    if not os.path.exists(ckpt_path):
        os.makedirs(ckpt_path)
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    elif not FLAGS.is_retrain:  # training from scratch: drop the summaries of the previous run
        shutil.rmtree(summary_path)
        os.makedirs(summary_path)

    print('1.Loading data...')
    W_embedding = np.load(embedding_path)
    print('training sample_num = %d' % n_tr_batches)
    print('valid sample_num = %d' % n_va_batches)

    # Initial or restore the model
    print('2.Building model...')
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.TextCNN(W_embedding, settings)
        with tf.variable_scope('training_ops') as vs:
            learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
                                                   FLAGS.decay_rate, staircase=True)
            # two optimizer: op1, update embedding; op2, do not update embedding.
            with tf.variable_scope('Optimizer1'):
                tvars1 = tf.trainable_variables()
                grads1 = tf.gradients(model.loss, tvars1)
                optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate)
                train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
                                                   global_step=model.global_step)
            with tf.variable_scope('Optimizer2'):
                tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
                grads2 = tf.gradients(model.loss, tvars2)
                optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
                train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
                                                   global_step=model.global_step)
            update_op = tf.group(*model.update_emas)
            merged = tf.summary.merge_all()  # summary
            train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
            test_writer = tf.summary.FileWriter(summary_path + 'test')
            # variables created inside this scope (optimizer state); the model's saver was
            # built earlier, so they are initialised separately after a restore
            training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]

        try:
            # If a model was saved before, continue from the latest checkpoint.
            if os.path.exists(ckpt_path + "checkpoint"):
                print("Restoring Variables from Checkpoint...")
                model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
                last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
                print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
                sess.run(tf.variables_initializer(training_ops))
                train_op2 = train_op1  # resumed runs update the embedding from the first epoch
            else:
                print('Initializing Variables...')
                sess.run(tf.global_variables_initializer())

            print('3.Begin training...')
            print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
            train_op = train_op2
            for epoch in range(FLAGS.max_max_epoch):
                global_step = sess.run(model.global_step)
                print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
                if epoch == FLAGS.max_epoch:  # update the embedding
                    train_op = train_op1
                train_fetches = [merged, model.loss, train_op, update_op]
                valid_fetches = [merged, model.loss]
                train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
            # run one final validation pass
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
                sess.run(model.global_step), valid_cost, precision, recall, f1))
            if f1 > last_f1:  # save the better model
                saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
                print('saved new model to %s ' % saving_path)
        finally:
            # flush pending summary events and release the event files
            train_writer.close()
            test_writer.close()


if __name__ == '__main__':
    # Script entry point: tf.app.run() parses the flags and then calls main().
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_2_hcnn/__init__.py
================================================
# -*- coding:utf-8 -*- 



================================================
FILE: zhihu-text-classification-master/models/wd_2_hcnn/network.py
================================================
# -*- coding:utf-8 -*-

import tensorflow as tf

"""wd_2_hcnn
title 部分使用 TextCNN；content 部分使用分层的 TextCNN。
"""


class Settings(object):
    """Hyper-parameters and output locations of the wd_2_hcnn model."""

    def __init__(self):
        name = 'wd_2_hcnn'
        tokens_per_sentence = 30

        self.model_name = name
        # The title length and the sentence length share one value.
        self.title_len = tokens_per_sentence
        self.sent_len = tokens_per_sentence
        self.doc_len = 10

        # Convolution widths of the sentence-level and document-level encoders.
        self.sent_filter_sizes = [2, 3, 4, 5]
        self.doc_filter_sizes = [2, 3, 4]
        self.n_filter = 256

        self.fc_hidden_size = 1024
        self.n_class = 1999

        # Output directories, relative to the directory the scripts run from.
        self.summary_path = '../../summary/' + name + '/'
        self.ckpt_path = '../../ckpt/' + name + '/'


class HCNN(object):
    """TextCNN on the title plus a two-level (sentence, document) TextCNN on the content.

    title: inputs->textcnn->output_title
    content: inputs->hcnn->output_content
    concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
    """

    def __init__(self, W_embedding, settings):
        """Build the whole graph: placeholders, encoders, classifier, loss and saver.

        Args:
            W_embedding: Array used to initialise the embedding variable; its
                ``shape`` is read and ``shape[1]`` is taken as the embedding size.
            settings: Object providing ``model_name``, ``sent_len``, ``doc_len``,
                ``sent_filter_sizes``, ``doc_filter_sizes``, ``n_filter``,
                ``n_class`` and ``fc_hidden_size``.
        """
        self.model_name = settings.model_name
        self.sent_len = settings.sent_len
        self.doc_len = settings.doc_len
        self.sent_filter_sizes = settings.sent_filter_sizes
        self.doc_filter_sizes = settings.doc_filter_sizes
        self.n_filter = settings.n_filter
        self.n_class = settings.n_class
        self.fc_hidden_size = settings.fc_hidden_size
        self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
        # moving-average update ops collected from every batchnorm() call
        self.update_emas = list()
        # placeholders
        self._tst = tf.placeholder(tf.bool)  # True selects the moving averages in batchnorm()
        self._keep_prob = tf.placeholder(tf.float32, [])
        self._batch_size = tf.placeholder(tf.int32, [])  # used for the reshapes in hcnn_inference()

        with tf.name_scope('Inputs'):
            self._X1_inputs = tf.placeholder(tf.int64, [None, self.sent_len], name='X1_inputs')
            self._X2_inputs = tf.placeholder(tf.int64, [None, self.doc_len * self.sent_len], name='X2_inputs')
            self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')

        with tf.variable_scope('embedding'):
            self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
                                             initializer=tf.constant_initializer(W_embedding), trainable=True)
        self.embedding_size = W_embedding.shape[1]

        with tf.variable_scope('cnn_text'):
            output_title = self.cnn_inference(self._X1_inputs)

        with tf.variable_scope('hcnn_content'):
            output_content = self.hcnn_inference(self._X2_inputs)

        with tf.variable_scope('fc-bn-layer'):
            output = tf.concat([output_title, output_content], axis=1)
            # title features (one group per sentence filter size) + content features
            # (one group per document filter size)
            output_size = self.n_filter * (len(self.sent_filter_sizes) + len(self.doc_filter_sizes))
            W_fc = self.weight_variable([output_size, self.fc_hidden_size], name='Weight_fc')
            tf.summary.histogram('W_fc', W_fc)
            h_fc = tf.matmul(output, W_fc, name='h_fc')
            # NOTE(review): name="beta_fc" is passed to tf.constant, not to tf.Variable, so the
            # variable itself gets a default name; renaming it would change checkpoint keys.
            beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc"))
            tf.summary.histogram('beta_fc', beta_fc)
            fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
            self.update_emas.append(update_ema_fc)
            self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
            fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)

        with tf.variable_scope('out_layer'):
            W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
            tf.summary.histogram('Weight_out', W_out)
            b_out = self.bias_variable([self.n_class], name='bias_out')
            tf.summary.histogram('bias_out', b_out)
            self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # raw score (logit) of every class

        with tf.name_scope('loss'):
            self._loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
            tf.summary.scalar('loss', self._loss)

        # Created here, so it only covers variables that exist at this point.
        self.saver = tf.train.Saver(max_to_keep=2)

    @property
    def tst(self):
        """Boolean placeholder that switches batchnorm() to the moving averages."""
        return self._tst

    @property
    def keep_prob(self):
        """Scalar float placeholder passed to the dropout layer."""
        return self._keep_prob

    @property
    def batch_size(self):
        """Scalar int placeholder holding the number of samples in the fed batch."""
        return self._batch_size

    @property
    def global_step(self):
        """Non-trainable step counter variable."""
        return self._global_step

    @property
    def X1_inputs(self):
        """Title placeholder, shape [None, sent_len]."""
        return self._X1_inputs

    @property
    def X2_inputs(self):
        """Content placeholder, shape [None, doc_len * sent_len]."""
        return self._X2_inputs

    @property
    def y_inputs(self):
        """Target placeholder, shape [None, n_class]."""
        return self._y_inputs

    @property
    def y_pred(self):
        """Output scores (logits), one per class."""
        return self._y_pred

    @property
    def loss(self):
        """Mean sigmoid cross-entropy between ``y_pred`` and ``y_inputs``."""
        return self._loss

    def weight_variable(self, shape, name):
        """Create a weight variable drawn from a truncated normal (stddev 0.1)."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial, name=name)

    def bias_variable(self, shape, name):
        """Create a bias variable filled with the constant 0.1."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial, name=name)

    def batchnorm(self, Ylogits, offset, convolutional=False):
        """Batch normalisation without a scale term.

        Args:
            Ylogits: Tensor to normalise.
            offset: The beta (shift) term added after normalisation; callers
                in this class initialise it to 0.1.
            convolutional: When True the batch statistics are taken over axes
                [0, 1, 2]; otherwise over axis [0] only.
        Returns:
            Ybn: The normalised tensor. The batch mean/variance are used
                unless ``self.tst`` is True, in which case their exponential
                moving averages are used instead.
            update_moving_everages: Op that updates the moving averages of the
                batch mean and variance; it has to be run by the caller.
        """
        exp_moving_avg = tf.train.ExponentialMovingAverage(0.999,
                                                           self._global_step)  # adding the iteration prevents from averaging across non-existing iterations
        bnepsilon = 1e-5  # small constant that avoids a division by zero
        if convolutional:
            mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
        else:
            mean, variance = tf.nn.moments(Ylogits, [0])
        update_moving_everages = exp_moving_avg.apply([mean, variance])
        m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
        v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
        # scale is None: only the offset is applied
        Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
        return Ybn, update_moving_everages

    def textcnn(self, X_inputs, n_step, filter_sizes, embed_size):
        """Build one TextCNN encoder: conv -> batchnorm -> relu -> max-pool per filter size.

        Args:
            X_inputs: Sequence of vectors to encode; a trailing channel axis is added here.
            n_step: Sequence length (the sentence len); it sizes the pooling window.
            filter_sizes: Convolution widths; one branch is built per entry.
            embed_size: Size of each input vector; every filter spans it entirely.
        Returns:
            Tensor reshaped to [-1, n_filter * len(filter_sizes)].
        """
        inputs = tf.expand_dims(X_inputs, -1)
        pooled_outputs = list()
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embed_size, 1, self.n_filter]
                W_filter = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_filter")
                beta = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.n_filter], name="beta"))
                tf.summary.histogram('beta', beta)
                conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True)  # batch norm goes before the activation
                # Apply nonlinearity, batch norm scaling is not useful with relus
                # batch norm offsets are used instead of biases, so no separate bias is created
                h = tf.nn.relu(conv_bn, name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1],
                                        strides=[1, 1, 1, 1], padding='VALID', name="pool")
                pooled_outputs.append(pooled)
                self.update_emas.append(update_ema)
        h_pool = tf.concat(pooled_outputs, 3)
        n_filter_total = self.n_filter * len(filter_sizes)
        h_pool_flat = tf.reshape(h_pool, [-1, n_filter_total])
        return h_pool_flat  # shape = [-1, n_filter_total]

    def cnn_inference(self, X_inputs):
        """TextCNN encoder for the title.
        Args:
            X_inputs: tensor.shape=(batch_size, title_len)
        Returns:
            title_outputs: tensor.shape=(batch_size, n_filter*filter_num_sent)
        """
        inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
        with tf.variable_scope('title_encoder'):  # build the vector representation of the title
            title_outputs = self.textcnn(inputs, self.sent_len, self.sent_filter_sizes, embed_size=self.embedding_size)
        return title_outputs  # shape = [batch_size, n_filter*filter_num_sent]

    def hcnn_inference(self, X_inputs):
        """Hierarchical TextCNN encoder for the content.
        Args:
            X_inputs: tensor.shape=(batch_size, doc_len*sent_len)
        Returns:
            doc_outputs: tensor.shape=(batch_size, n_filter*filter_num_doc)
        """
        inputs = tf.nn.embedding_lookup(self.embedding,
                                        X_inputs)  # inputs.shape=[batch_size, doc_len*sent_len, embedding_size]
        sent_inputs = tf.reshape(inputs, [self.batch_size * self.doc_len, self.sent_len,
                                          self.embedding_size])  # [batch_size*doc_len, sent_len, embedding_size]
        with tf.variable_scope('sentence_encoder'):  # build the sentence vectors
            sent_outputs = self.textcnn(sent_inputs, self.sent_len, self.sent_filter_sizes, self.embedding_size)
        with tf.variable_scope('doc_encoder'):  # build the document vector
            doc_inputs = tf.reshape(sent_outputs, [self.batch_size, self.doc_len, self.n_filter * len(
                self.sent_filter_sizes)])  # [batch_size, doc_len, n_filter*len(filter_sizes_sent)]
            doc_outputs = self.textcnn(doc_inputs, self.doc_len, self.doc_filter_sizes, self.n_filter * len(
                self.sent_filter_sizes))  # [batch_size, n_filter*len(doc_filter_sizes)]
        return doc_outputs  # [batch_size,  n_filter*len(doc_filter_sizes)]

# test the model
# def test():
#     import numpy as np
#     print('Begin testing...')
#     settings = Settings()
#     W_embedding = np.random.randn(50, 10)
#     config = tf.ConfigProto()
#     config.gpu_options.allow_growth = True
#     batch_size = 128
#     with tf.Session(config=config) as sess:
#         model = HCNN(W_embedding, settings)
#         optimizer = tf.train.AdamOptimizer(0.001)
#         train_op = optimizer.minimize(model.loss)
#         update_op = tf.group(*model.update_emas)
#         sess.run(tf.global_variables_initializer())
#         fetch = [model.loss, model.y_pred, train_op, update_op]
#         loss_list = list()
#         for i in xrange(100):
#             X1_batch = np.zeros((batch_size, 30), dtype=float)
#             X2_batch = np.zeros((batch_size, 10 * 30), dtype=float)
#             y_batch = np.zeros((batch_size, 1999), dtype=int)
#             _batch_size = len(y_batch)
#             feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
#                          model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
#             loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
#             loss_list.append(loss)
#             print(i, loss)

# test()


================================================
FILE: zhihu-text-classification-master/models/wd_2_hcnn/predict.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import time
import network

sys.path.append('../..')
from evaluator import score_eval

# Model settings shared by the functions below.
settings = network.Settings()
title_len = settings.title_len  # number of leading columns of X that hold the title
model_name = settings.model_name  # used as the file name of the saved score arrays
ckpt_path = settings.ckpt_path

# Output directories for the validation and test scores; created on import if missing.
local_scores_path = '../../local_scores/'
scores_path = '../../scores/'
if not os.path.exists(local_scores_path):
    os.makedirs(local_scores_path)
if not os.path.exists(scores_path):
    os.makedirs(scores_path)

# Input locations, relative to the directory the script is started from.
embedding_path = '../../data/word_embedding.npy'
data_valid_path = '../../data/wd-data/seg_valid/'
data_test_path = '../../data/wd-data/seg_test/'
va_batches = os.listdir(data_valid_path)
te_batches = os.listdir(data_test_path)  # list of batch file names
n_va_batches = len(va_batches)
n_te_batches = len(te_batches)


def get_batch(batch_id):
    """Load one validation batch and split it into title and content parts.

    Args:
        batch_id: Batch number; the archive read is
            ``<data_valid_path><batch_id>.npz``.

    Returns:
        ``[X1_batch, X2_batch, y_batch]``: the first ``title_len`` columns of
        the stored ``X`` array, the remaining columns of ``X``, and the stored
        ``y`` array.
    """
    # np.load returns a lazy NpzFile that keeps the archive open; the context
    # manager closes it deterministically instead of relying on garbage collection.
    with np.load(data_valid_path + str(batch_id) + '.npz') as new_batch:
        X_batch = new_batch['X']
        y_batch = new_batch['y']
    X1_batch = X_batch[:, :title_len]
    X2_batch = X_batch[:, title_len:]
    return [X1_batch, X2_batch, y_batch]


def get_test_batch(batch_id):
    """Load one test batch (inputs only) and split it into title and content.

    Args:
        batch_id: Batch number; the file read is ``<data_test_path><batch_id>.npy``.

    Returns:
        ``[X1_batch, X2_batch]``: the first ``title_len`` columns and the
        remaining columns of the stored array.
    """
    batch_file = data_test_path + str(batch_id) + '.npy'
    tokens = np.load(batch_file)
    title_part, content_part = tokens[:, :title_len], tokens[:, title_len:]
    return [title_part, content_part]


def local_predict(sess, model):
    """Score the validation set, print precision/recall/F1 and save the raw scores.

    The per-batch score matrices are stacked row-wise and written to
    ``<local_scores_path><model_name>.npy``.

    Args:
        sess: Session used to run the graph.
        model: Network object exposing ``y_pred`` and the input placeholders.
    """
    time0 = time.time()
    predict_labels_list = list()  # top-5 predicted label indices of every sample
    marked_labels_list = list()
    predict_scores = list()
    # range (not xrange) so the script also runs under Python 3
    for i in tqdm(range(n_va_batches)):
        [X1_batch, X2_batch, y_batch] = get_batch(i)
        marked_labels_list.extend(y_batch)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 highest scores
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
    # Stack the list directly: batches may differ in row count (e.g. a shorter
    # last batch), which np.asarray cannot turn into one regular array.
    predict_scores = np.vstack(predict_scores)
    local_scores_name = local_scores_path + model_name + '.npy'
    np.save(local_scores_name, predict_scores)
    print('local_scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0))


def predict(sess, model):
    """Score the test set and save the raw scores.

    The per-batch score matrices are stacked row-wise and written to
    ``<scores_path><model_name>.npy``.

    Args:
        sess: Session used to run the graph.
        model: Network object exposing ``y_pred`` and the input placeholders.
    """
    time0 = time.time()
    predict_scores = list()
    # range (not xrange) so the script also runs under Python 3
    for i in tqdm(range(n_te_batches)):
        [X1_batch, X2_batch] = get_test_batch(i)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
    # Stack the list directly: batches may differ in row count (e.g. a shorter
    # last batch), which np.asarray cannot turn into one regular array.
    predict_scores = np.vstack(predict_scores)
    scores_name = scores_path + model_name + '.npy'
    np.save(scores_name, predict_scores)
    print('scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0))


def main(_):
    """Restore the latest HCNN checkpoint and write validation and test scores.

    Exits with status 1 when no checkpoint index exists under ``ckpt_path``.

    Args:
        _: Unused positional argument supplied by ``tf.app.run``.
    """
    if not os.path.exists(ckpt_path + 'checkpoint'):
        print('there is not saved model, please check the ckpt path')
        # sys.exit instead of the interactive-only exit() helper; a non-zero
        # status lets calling scripts detect the failure.
        sys.exit(1)
    print('Loading model...')
    W_embedding = np.load(embedding_path)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.HCNN(W_embedding, settings)
        model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
        print('Local predicting...')
        local_predict(sess, model)
        print('Test predicting...')
        predict(sess, model)


if __name__ == '__main__':
    # Script entry point: tf.app.run() parses the flags and then calls main().
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_2_hcnn/train.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import shutil
import time
import network

sys.path.append('../..')
from data_helpers import to_categorical
from evaluator import score_eval

# Command-line flags of the training script.
flags = tf.flags
flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary')
flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6')
flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3')
flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65')
flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
# Full-run settings
flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
# Validation F1 a model has to beat before it is saved; the help text states the real default.
flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.38')

# Quick-test settings (swap with the three definitions above)
# flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
# flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
# flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
FLAGS = flags.FLAGS

lr = FLAGS.lr
last_f1 = FLAGS.last_f1  # module-level best F1, updated by the training functions
# Model settings shared by the functions below.
settings = network.Settings()
title_len = settings.title_len  # number of leading columns of X that hold the title
summary_path = settings.summary_path
ckpt_path = settings.ckpt_path
model_path = ckpt_path + 'model.ckpt'  # checkpoint prefix handed to the saver

# Input locations, relative to the directory the script is started from.
embedding_path = '../../data/word_embedding.npy'
data_train_path = '../../data/wd-data/seg_train/'
data_valid_path = '../../data/wd-data/seg_valid/'
tr_batches = os.listdir(data_train_path)  # list of batch file names
va_batches = os.listdir(data_valid_path)
n_tr_batches = len(tr_batches)
n_va_batches = len(va_batches)

# For quick tests: uncomment to cap the number of batches.
# n_tr_batches = 1000
# n_va_batches = 50


def get_batch(data_path, batch_id):
    """Load one pre-built batch and split it into title and content parts.

    Args:
        data_path: Directory prefix of the batch archives. It is concatenated
            directly with the file name, so it must end with a path separator.
        batch_id: Batch number; the archive read is ``<data_path><batch_id>.npz``.

    Returns:
        ``[X1_batch, X2_batch, y_batch]``: the first ``title_len`` columns of
        the stored ``X`` array, the remaining columns of ``X``, and the stored
        ``y`` array.
    """
    # np.load returns a lazy NpzFile that keeps the archive open; the context
    # manager closes it deterministically instead of relying on garbage collection.
    with np.load(data_path + str(batch_id) + '.npz') as new_batch:
        X_batch = new_batch['X']
        y_batch = new_batch['y']
    X1_batch = X_batch[:, :title_len]
    X2_batch = X_batch[:, title_len:]
    return [X1_batch, X2_batch, y_batch]


def valid_epoch(data_path, sess, model):
    """Evaluate the model on every batch found under ``data_path``.

    The number of batches is the number of directory entries; batches are
    loaded by index ``0 .. n-1``.

    Args:
        data_path: Directory prefix of the validation batches.
        sess: Session used to run the graph.
        model: Network object exposing ``loss``, ``y_pred`` and the placeholders.

    Returns:
        ``(mean_cost, precision, recall, f1)`` where ``mean_cost`` is the
        average of the per-batch losses and the other three come from
        ``score_eval`` applied to (top-5 predicted indices, marked labels) pairs.
    """
    batch_count = len(os.listdir(data_path))
    fetches = [model.loss, model.y_pred]
    cost_sum = 0.0
    top5_predictions = []  # predicted label indices for all samples
    true_labels = []
    for batch_id in range(batch_count):
        titles, contents, labels = get_batch(data_path, batch_id)
        true_labels.extend(labels)
        targets = to_categorical(labels)
        feed_dict = {
            model.X1_inputs: titles,
            model.X2_inputs: contents,
            model.y_inputs: targets,
            model.batch_size: len(targets),
            model.tst: True,
            model.keep_prob: 1.0,
        }
        batch_cost, scores = sess.run(fetches, feed_dict)
        cost_sum += batch_cost
        # indices of the five highest scores, best first
        top5_predictions.extend([row.argsort()[-1:-6:-1] for row in scores])
    precision, recall, f1 = score_eval(zip(top5_predictions, true_labels))
    return cost_sum / batch_count, precision, recall, f1


def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
    """Run one pass over the training batches in random order.

    Before each step the current global step is read. Whenever
    ``global_step + 1`` is a multiple of ``FLAGS.valid_step`` the whole
    validation set is evaluated and the model is saved if its F1 beats the
    module-level ``last_f1``. Every 500 steps the training summary and the
    summary of one randomly chosen validation batch are written.

    Args:
        data_path: Directory prefix the training batches are loaded from.
        sess: Session used to run the graph.
        model: Network object exposing the placeholders, ``global_step`` and ``saver``.
        train_fetches: Fetch list of a training step; four results are
            unpacked and the first one is written as the summary.
        valid_fetches: Fetch list of the validation summary step; two results
            are unpacked.
        train_writer: Writer receiving the training summaries.
        test_writer: Writer receiving the validation summaries.
    """
    global last_f1
    time0 = time.time()
    # The number of steps per epoch comes from the module-level batch count.
    batch_indexs = np.random.permutation(n_tr_batches)  # shuffle the training data
    for batch in tqdm(range(n_tr_batches)):
        global_step = sess.run(model.global_step)
        if 0 == (global_step + 1) % FLAGS.valid_step:
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
                global_step, valid_cost, precision, recall, f1, time.time() - time0))
            time0 = time.time()
            if f1 > last_f1:
                last_f1 = f1
                saving_path = model.saver.save(sess, model_path, global_step+1)
                print('saved new model to %s ' % saving_path)
        # training: honour the data_path argument instead of the global train path
        batch_id = batch_indexs[batch]
        [X1_batch, X2_batch, y_batch] = get_batch(data_path, batch_id)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
        summary, _cost, _, _ = sess.run(train_fetches, feed_dict)  # the cost is the mean cost of one batch
        # write summaries every 500 steps
        if 0 == (global_step + 1) % 500:
            train_writer.add_summary(summary, global_step)
            batch_id = np.random.randint(0, n_va_batches)  # pick one validation batch at random
            [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
            y_batch = to_categorical(y_batch)
            _batch_size = len(y_batch)
            feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                         model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
            summary, _cost = sess.run(valid_fetches, feed_dict)
            test_writer.add_summary(summary, global_step)


def main(_):
    """Build the HCNN graph, train it and checkpoint the best model.

    Steps: create the checkpoint and summary directories (an existing summary
    directory is wiped unless ``FLAGS.is_retrain`` is set), load the word
    embedding, build the model plus two Adam optimizers, restore the latest
    checkpoint when one exists, run ``FLAGS.max_max_epoch`` epochs and finish
    with one more validation pass.

    Args:
        _: Unused positional argument supplied by ``tf.app.run``.
    """
    global last_f1
    if not os.path.exists(ckpt_path):
        os.makedirs(ckpt_path)
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    elif not FLAGS.is_retrain:  # training from scratch: drop the summaries of the previous run
        shutil.rmtree(summary_path)
        os.makedirs(summary_path)

    print('1.Loading data...')
    W_embedding = np.load(embedding_path)
    print('training sample_num = %d' % n_tr_batches)
    print('valid sample_num = %d' % n_va_batches)

    # Initial or restore the model
    print('2.Building model...')
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.HCNN(W_embedding, settings)
        with tf.variable_scope('training_ops') as vs:
            learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
                                                   FLAGS.decay_rate, staircase=True)
            # two optimizer: op1, update embedding; op2, do not update embedding.
            with tf.variable_scope('Optimizer1'):
                tvars1 = tf.trainable_variables()
                grads1 = tf.gradients(model.loss, tvars1)
                optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate)
                train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
                                                   global_step=model.global_step)
            with tf.variable_scope('Optimizer2'):
                tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
                grads2 = tf.gradients(model.loss, tvars2)
                optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
                train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
                                                   global_step=model.global_step)
            update_op = tf.group(*model.update_emas)
            merged = tf.summary.merge_all()  # summary
            train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
            test_writer = tf.summary.FileWriter(summary_path + 'test')
            # variables created inside this scope (optimizer state); the model's saver was
            # built earlier, so they are initialised separately after a restore
            training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]

        try:
            # If a model was saved before, continue from the latest checkpoint.
            if os.path.exists(ckpt_path + "checkpoint"):
                print("Restoring Variables from Checkpoint...")
                model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
                last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
                print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
                sess.run(tf.variables_initializer(training_ops))
                train_op2 = train_op1  # resumed runs update the embedding from the first epoch
            else:
                print('Initializing Variables...')
                sess.run(tf.global_variables_initializer())

            print('3.Begin training...')
            print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
            train_op = train_op2
            for epoch in range(FLAGS.max_max_epoch):
                global_step = sess.run(model.global_step)
                print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
                if epoch == FLAGS.max_epoch:  # update the embedding
                    train_op = train_op1
                train_fetches = [merged, model.loss, train_op, update_op]
                valid_fetches = [merged, model.loss]
                train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
            # run one final validation pass
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
                sess.run(model.global_step), valid_cost, precision, recall, f1))
            if f1 > last_f1:  # save the better model
                saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
                print('saved new model to %s ' % saving_path)
        finally:
            # flush pending summary events and release the event files
            train_writer.close()
            test_writer.close()


if __name__ == '__main__':
    # Script entry point: tf.app.run() parses the flags and then calls main().
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_3_bigru/__init__.py
================================================
# -*- coding:utf-8 -*- 



================================================
FILE: zhihu-text-classification-master/models/wd_3_bigru/network.py
================================================
# -*- coding:utf-8 -*-

import tensorflow as tf
from tensorflow.contrib import rnn
import tensorflow.contrib.layers as layers

"""wd_3_bigru
title 部分使用 bigru+attention；content 部分使用 bigru+attention； 两部分输出直接 concat。
"""


class Settings(object):
    """Hyper-parameters and output locations of the wd_3_bigru model."""

    def __init__(self):
        name = 'wd_3_bigru'
        self.model_name = name
        # Widths of the title / content id sequences fed to the network.
        self.title_len, self.content_len = 30, 150
        # GRU state size and number of stacked bidirectional layers.
        self.hidden_size, self.n_layer = 256, 1
        # Width of the fully connected layer and number of output classes.
        self.fc_hidden_size, self.n_class = 1024, 1999
        # Per-model output directories (relative paths).
        self.summary_path = '../../summary/' + name + '/'
        self.ckpt_path = '../../ckpt/' + name + '/'


class BiGRU(object):
    """Two-branch bidirectional-GRU classifier with attention pooling.

    title:   inputs -> bigru + attention -> output_title
    content: inputs -> bigru + attention -> output_content
    concat[output_title, output_content] -> fc + bn + relu -> output layer,
    trained with a sigmoid cross-entropy loss.
    """

    def __init__(self, W_embedding, settings):
        """Build the complete graph: placeholders, both branches, classifier, loss and saver.

        Args:
            W_embedding: array used to initialise the embedding variable; its
                `.shape` is the variable shape and `.shape[1]` the embedding size.
            settings: object providing model_name, title_len, content_len,
                hidden_size, n_layer, n_class and fc_hidden_size.
        """
        self.model_name = settings.model_name
        self.title_len = settings.title_len
        self.content_len = settings.content_len
        self.hidden_size = settings.hidden_size
        self.n_layer = settings.n_layer
        self.n_class = settings.n_class
        self.fc_hidden_size = settings.fc_hidden_size
        self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
        # Moving-average update ops created by batchnorm(); callers must run them while training.
        self.update_emas = list()
        # placeholders
        self._tst = tf.placeholder(tf.bool)  # True -> batchnorm uses the moving averages
        self._keep_prob = tf.placeholder(tf.float32, [])
        self._batch_size = tf.placeholder(tf.int32, [])

        with tf.name_scope('Inputs'):
            self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
            self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs')
            self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')

        with tf.variable_scope('embedding'):
            # Trainable embedding matrix initialised from W_embedding.
            self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
                                             initializer=tf.constant_initializer(W_embedding), trainable=True)
        self.embedding_size = W_embedding.shape[1]

        # Title branch (the scope is named 'bigru_text').
        with tf.variable_scope('bigru_text'):
            output_title = self.bigru_inference(self._X1_inputs)

        with tf.variable_scope('bigru_content'):
            output_content = self.bigru_inference(self._X2_inputs)

        with tf.variable_scope('fc-bn-layer'):
            # Each branch yields hidden_size*2 features, hence hidden_size*4 after the concat.
            output = tf.concat([output_title, output_content], axis=1)
            W_fc = self.weight_variable([self.hidden_size * 4, self.fc_hidden_size], name='Weight_fc')
            tf.summary.histogram('W_fc', W_fc)
            h_fc = tf.matmul(output, W_fc, name='h_fc')
            # NOTE(review): name="beta_fc" is given to the constant, not to the Variable;
            # renaming the Variable would change the checkpoint keys.
            beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc"))
            tf.summary.histogram('beta_fc', beta_fc)
            fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
            self.update_emas.append(update_ema_fc)
            self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")

        with tf.variable_scope('out_layer'):
            W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
            tf.summary.histogram('Weight_out', W_out)
            b_out = self.bias_variable([self.n_class], name='bias_out')
            tf.summary.histogram('bias_out', b_out)
            self._y_pred = tf.nn.xw_plus_b(self.fc_bn_relu, W_out, b_out, name='y_pred')  # per-class scores (logits)

        with tf.name_scope('loss'):
            # Mean element-wise sigmoid cross-entropy between the logits and y_inputs.
            self._loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
            tf.summary.scalar('loss', self._loss)

        # Created here, so it covers only the variables that exist at this point.
        self.saver = tf.train.Saver(max_to_keep=1)

    # Read-only accessors for the placeholders and output tensors.
    @property
    def tst(self):
        return self._tst

    @property
    def keep_prob(self):
        return self._keep_prob

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def global_step(self):
        return self._global_step

    @property
    def X1_inputs(self):
        return self._X1_inputs

    @property
    def X2_inputs(self):
        return self._X2_inputs

    @property
    def y_inputs(self):
        return self._y_inputs

    @property
    def y_pred(self):
        return self._y_pred

    @property
    def loss(self):
        return self._loss

    def weight_variable(self, shape, name):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial, name=name)

    def bias_variable(self, shape, name):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial, name=name)

    def batchnorm(self, Ylogits, offset, convolutional=False):
        """Batch normalization without a learned scale.

        Args:
            Ylogits: tensor to normalise. Moments are taken over axis 0, or over
                axes [0, 1, 2] when `convolutional` is True.
            offset: beta (additive offset) applied after normalisation.
            convolutional: selects the axes used for the moments (see above).
        Returns:
            Ybn: normalised tensor with the same shape as Ylogits.
            update_moving_everages: op that updates the exponential moving
                averages of the batch mean and variance; those averages are
                used instead of the batch statistics when `self.tst` is True.
        """
        exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step)  # adding the iteration prevents from averaging across non-existing iterations
        bnepsilon = 1e-5  # small constant guarding against division by zero
        if convolutional:
            mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
        else:
            mean, variance = tf.nn.moments(Ylogits, [0])
        update_moving_everages = exp_moving_avg.apply([mean, variance])
        # Test mode -> moving averages; training mode -> statistics of the current batch.
        m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
        v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
        Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
        return Ybn, update_moving_everages

    def gru_cell(self):
        """Return a GRU cell of `hidden_size` units wrapped with output dropout (keep_prob)."""
        with tf.name_scope('gru_cell'):
            cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse)
        return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

    def bi_gru(self, inputs):
        """Build the stacked bidirectional GRU and return its `outputs` tensor.

        Uses `n_layer` forward and backward cells with zero initial states of
        size `batch_size`; the final forward/backward states are discarded.
        """
        cells_fw = [self.gru_cell() for _ in range(self.n_layer)]
        cells_bw = [self.gru_cell() for _ in range(self.n_layer)]
        initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw]
        initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw]
        outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs,
                                                            initial_states_fw=initial_states_fw,
                                                            initial_states_bw=initial_states_bw, dtype=tf.float32)
        return outputs

    def task_specific_attention(self, inputs, output_size,
                                initializer=layers.xavier_initializer(),
                                activation_fn=tf.tanh, scope=None):
        """
        Performs task-specific attention reduction, using learned
        attention context vector (constant within task of interest).
        Args:
            inputs: Tensor of shape [batch_size, units, input_size]
                `input_size` must be static (known)
                `units` axis will be attended over (reduced from output)
                `batch_size` will be preserved
            output_size: Size of output's inner (feature) dimension
        Returns:
           outputs: Tensor of shape [batch_size, output_dim].
        """
        assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
        with tf.variable_scope(scope or 'attention') as scope:
            # u_w: the learned attention context vector
            attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size],
                                                       initializer=initializer, dtype=tf.float32)
            # Fully connected projection h_i -> u_i:
            # [batch_size, units, input_size] -> [batch_size, units, output_size]
            input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope)
            # Score of every unit against the context vector; keep_dims=True gives [batch_size, units, 1]
            vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True)
            # Normalise the scores over the `units` axis.
            attention_weights = tf.nn.softmax(vector_attn, dim=1)
            tf.summary.histogram('attention_weigths', attention_weights)
            # The weights are applied to the raw inputs, not to input_projection.
            weighted_projection = tf.multiply(inputs, attention_weights)
            outputs = tf.reduce_sum(weighted_projection, axis=1)
            return outputs  # [batch_size, hidden_size*2]

    def bigru_inference(self, X_inputs):
        """Embed the id sequence, encode it with the bi-GRU and pool it with attention."""
        inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
        output_bigru = self.bi_gru(inputs)
        output_att = self.task_specific_attention(output_bigru, self.hidden_size*2)
        return output_att


# test the model
def test():
    """Smoke test: build the model on a tiny random embedding and run 100 training steps.

    Every batch is all zeros, so this only checks that the graph builds and
    that a training step executes; the loss of each step is printed.
    """
    import numpy as np
    print('Begin testing...')
    settings = Settings()
    W_embedding = np.random.randn(50, 10)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    batch_size = 128
    with tf.Session(config=config) as sess:
        model = BiGRU(W_embedding, settings)
        optimizer = tf.train.AdamOptimizer(0.001)
        train_op = optimizer.minimize(model.loss)
        update_op = tf.group(*model.update_emas)
        sess.run(tf.global_variables_initializer())
        fetch = [model.loss, model.y_pred, train_op, update_op]
        loss_list = list()
        # range() instead of xrange(): xrange does not exist on Python 3.
        for i in range(100):
            # Shapes follow the settings so they always match the placeholders.
            X1_batch = np.zeros((batch_size, settings.title_len), dtype=float)
            X2_batch = np.zeros((batch_size, settings.content_len), dtype=float)
            y_batch = np.zeros((batch_size, settings.n_class), dtype=int)
            _batch_size = len(y_batch)
            feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                         model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
            loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
            loss_list.append(loss)
            print(i, loss)

# Running this file directly executes the smoke test above.
if __name__ == '__main__':
    test()


================================================
FILE: zhihu-text-classification-master/models/wd_3_bigru/predict.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import time
import network

sys.path.append('../..')
from evaluator import score_eval

# Model configuration shared by the functions below.
settings = network.Settings()
title_len = settings.title_len  # column at which a sample is split into title / content
model_name = settings.model_name
ckpt_path = settings.ckpt_path

# Output directories for validation-set and test-set scores; created on import if missing.
local_scores_path = '../../local_scores/'
scores_path = '../../scores/'
if not os.path.exists(local_scores_path):
    os.makedirs(local_scores_path)
if not os.path.exists(scores_path):
    os.makedirs(scores_path)

embedding_path = '../../data/word_embedding.npy'
data_valid_path = '../../data/wd-data/data_valid/'
data_test_path = '../../data/wd-data/data_test/'
# The batch counts are the number of directory entries; batches are later
# addressed by the indices 0..n-1 (see get_batch / get_test_batch).
va_batches = os.listdir(data_valid_path)
te_batches = os.listdir(data_test_path)  # list of batch file names
n_va_batches = len(va_batches)
n_te_batches = len(te_batches)


def get_batch(batch_id):
    """Load validation batch `batch_id` and split it into title, content and labels.

    Reads `<data_valid_path><batch_id>.npz` and cuts its 'X' array at column
    `title_len`.

    Returns:
        [X1_batch, X2_batch, y_batch]: title columns, content columns and the 'y' array.
    """
    archive = np.load(data_valid_path + str(batch_id) + '.npz')
    samples, labels = archive['X'], archive['y']
    return [samples[:, :title_len], samples[:, title_len:], labels]


def get_test_batch(batch_id):
    """Load test batch `batch_id` and split it into title and content columns.

    Reads `<data_test_path><batch_id>.npy` and cuts the array at column `title_len`.

    Returns:
        [X1_batch, X2_batch]: title columns and content columns.
    """
    samples = np.load(data_test_path + str(batch_id) + '.npy')
    title_part = samples[:, :title_len]
    content_part = samples[:, title_len:]
    return [title_part, content_part]


def local_predict(sess, model):
    """Score the validation set, print top-5 precision/recall/F1 and save the raw scores.

    Every validation batch 0..n_va_batches-1 is run through `model.y_pred`
    in test mode (tst=True, keep_prob=1.0). The five highest-scoring class
    indices per sample are paired with the true labels and passed to
    `score_eval`; the stacked score matrix is written to
    `<local_scores_path><model_name>.npy`.

    Args:
        sess: session in which the model variables have been restored.
        model: network exposing X1_inputs, X2_inputs, batch_size, tst,
            keep_prob and y_pred.
    """
    time0 = time.time()
    predict_labels_list = list()  # top-5 predicted class indices of every sample
    marked_labels_list = list()   # true labels of every sample
    predict_scores = list()       # raw score matrix of every batch
    for i in tqdm(range(n_va_batches)):
        [X1_batch, X2_batch, y_batch] = get_batch(i)
        marked_labels_list.extend(y_batch)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 largest scores
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
    # Stack the per-batch matrices directly. Converting the list with
    # np.asarray first fails when the batches differ in size (for example a
    # smaller final batch), because no regular array can be built from them.
    predict_scores = np.vstack(predict_scores)
    local_scores_name = local_scores_path + model_name + '.npy'
    np.save(local_scores_name, predict_scores)
    print('local_scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0))


def predict(sess, model):
    """Score the test set and save the raw scores.

    Every test batch 0..n_te_batches-1 is run through `model.y_pred` in test
    mode (tst=True, keep_prob=1.0); the stacked score matrix is written to
    `<scores_path><model_name>.npy`.

    Args:
        sess: session in which the model variables have been restored.
        model: network exposing X1_inputs, X2_inputs, batch_size, tst,
            keep_prob and y_pred.
    """
    time0 = time.time()
    predict_scores = list()  # raw score matrix of every batch
    for i in tqdm(range(n_te_batches)):
        [X1_batch, X2_batch] = get_test_batch(i)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
    # Stack the per-batch matrices directly. Converting the list with
    # np.asarray first fails when the batches differ in size (for example a
    # smaller final batch), because no regular array can be built from them.
    predict_scores = np.vstack(predict_scores)
    scores_name = scores_path + model_name + '.npy'
    np.save(scores_name, predict_scores)
    print('scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0))


def main(_):
    """Restore the latest checkpoint, then score the validation and test sets.

    Exits with status 1 when `<ckpt_path>checkpoint` does not exist.

    Args:
        _: argument list passed in by tf.app.run(); unused.
    """
    if not os.path.exists(ckpt_path + 'checkpoint'):
        print('there is not saved model, please check the ckpt path')
        # sys.exit instead of the interactive-shell helper exit(); the
        # non-zero status lets calling scripts detect the failure.
        sys.exit(1)
    print('Loading model...')
    W_embedding = np.load(embedding_path)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # allocate GPU memory on demand
    with tf.Session(config=config) as sess:
        model = network.BiGRU(W_embedding, settings)
        model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
        print('Local predicting...')
        local_predict(sess, model)
        print('Test predicting...')
        predict(sess, model)


# Script entry point: tf.app.run() parses the command-line flags and then calls main().
if __name__ == '__main__':
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_3_bigru/train.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import shutil
import time
import network

sys.path.append('../..')
from data_helpers import to_categorical
from evaluator import score_eval

# Command-line flags controlling the training run.
flags = tf.flags
flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary')
flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6')
flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4')
flags.DEFINE_float('decay_rate', 0.85, 'decay rate, default: 0.85')
flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
# Settings for a real training run
flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
flags.DEFINE_float('last_f1', 0.40, 'if valid_f1 > last_f1, save new model. default: 0.40')

# Settings for a quick test run (disabled)
# flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
# flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
# flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
FLAGS = flags.FLAGS

# Module-level state; last_f1 is raised by train_epoch()/main() whenever a better model is found.
lr = FLAGS.lr
last_f1 = FLAGS.last_f1
settings = network.Settings()
title_len = settings.title_len  # column at which a sample is split into title / content
summary_path = settings.summary_path
ckpt_path = settings.ckpt_path
model_path = ckpt_path + 'model.ckpt'

embedding_path = '../../data/word_embedding.npy'
data_train_path = '../../data/wd-data/data_train/'
data_valid_path = '../../data/wd-data/data_valid/'
# The batch counts are the number of directory entries; batches are later
# addressed by the indices 0..n-1 (see get_batch).
tr_batches = os.listdir(data_train_path)  # list of batch file names
va_batches = os.listdir(data_valid_path)
n_tr_batches = len(tr_batches)
n_va_batches = len(va_batches)

# Batch counts for a quick test run (disabled)
# n_tr_batches = 1000
# n_va_batches = 50


def get_batch(data_path, batch_id):
    """Load batch `batch_id` from `data_path` and split it into title, content and labels.

    Reads `<data_path><batch_id>.npz` and cuts its 'X' array at column `title_len`.

    Returns:
        [X1_batch, X2_batch, y_batch]: title columns, content columns and the 'y' array.
    """
    archive = np.load(data_path + str(batch_id) + '.npz')
    samples = archive['X']
    labels = archive['y']
    title_part, content_part = samples[:, :title_len], samples[:, title_len:]
    return [title_part, content_part, labels]


def valid_epoch(data_path, sess, model):
    """Evaluate the model on every batch found in `data_path`.

    The number of batches is the number of entries in `data_path`; batches
    are loaded by the indices 0..n-1. Each batch is run in test mode
    (tst=True, keep_prob=1.0) and the five highest-scoring class indices per
    sample are paired with the true labels for `score_eval`.

    Returns:
        (mean_cost, precision, recall, f1), where mean_cost is the average of
        the per-batch losses.
    """
    batch_count = len(os.listdir(data_path))
    total_cost = 0.0
    top5_predictions = []  # top-5 predicted class indices of every sample
    true_labels = []
    for batch_id in range(batch_count):
        x1, x2, y = get_batch(data_path, batch_id)
        true_labels.extend(y)
        y = to_categorical(y)
        feed = {model.X1_inputs: x1, model.X2_inputs: x2, model.y_inputs: y,
                model.batch_size: len(y), model.tst: True, model.keep_prob: 1.0}
        cost, scores = sess.run([model.loss, model.y_pred], feed)
        total_cost += cost
        # indices of the 5 largest scores, best first
        top5_predictions.extend([row.argsort()[-1:-6:-1] for row in scores])
    precision, recall, f1 = score_eval(zip(top5_predictions, true_labels))
    return total_cost / batch_count, precision, recall, f1


def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
    """Run one pass over the training batches in random order.

    Before each training step, when (global_step + 1) is a multiple of
    FLAGS.valid_step, the whole validation set is evaluated and the model is
    saved if its F1 beats the module-level `last_f1`. After each training
    step, when (global_step + 1) is a multiple of 500, the training summary
    and the summary of one random validation batch are written.

    Args:
        data_path: not used; batches are always read from `data_train_path`.
        sess: the active session.
        model: the network being trained.
        train_fetches: four fetches; the first two results are the summary and the loss.
        valid_fetches: two fetches yielding the summary and the loss.
        train_writer, test_writer: summary writers for the two summaries.
    """
    global last_f1

    def build_feed(x1, x2, y, is_test, keep):
        # One feed dict for both the training and the validation step.
        return {model.X1_inputs: x1, model.X2_inputs: x2, model.y_inputs: y,
                model.batch_size: len(y), model.tst: is_test, model.keep_prob: keep}

    tick = time.time()
    shuffled_ids = np.random.permutation(n_tr_batches)  # shuffle the training data
    for position in tqdm(range(n_tr_batches)):
        step = sess.run(model.global_step)
        next_step = step + 1
        if next_step % FLAGS.valid_step == 0:
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
                step, valid_cost, precision, recall, f1, time.time() - tick))
            tick = time.time()
            if f1 > last_f1:
                last_f1 = f1
                saving_path = model.saver.save(sess, model_path, next_step)
                print('saved new model to %s ' % saving_path)
        # training step; the returned cost is the mean cost of one batch
        x1, x2, y = get_batch(data_train_path, shuffled_ids[position])
        y = to_categorical(y)
        summary, _cost, _, _ = sess.run(train_fetches, build_feed(x1, x2, y, False, FLAGS.keep_prob))
        # every 500 steps: log the training summary and one random validation batch
        if next_step % 500 == 0:
            train_writer.add_summary(summary, step)
            x1, x2, y = get_batch(data_valid_path, np.random.randint(0, n_va_batches))
            y = to_categorical(y)
            summary, _cost = sess.run(valid_fetches, build_feed(x1, x2, y, True, 1.0))
            test_writer.add_summary(summary, step)


def main(_):
    """Build the model and optimizers, restore or initialise variables, and train.

    Two optimizers are built: train_op1 updates every trainable variable,
    train_op2 skips variables whose name contains 'embedding'. A fresh run
    trains with train_op2 for the first FLAGS.max_epoch epochs and then
    switches to train_op1; a run restored from a checkpoint uses train_op1
    from the start. The model is saved whenever the validation F1 beats
    `last_f1`.

    Args:
        _: argument list passed in by tf.app.run(); unused.
    """
    global last_f1
    if not os.path.exists(ckpt_path):
        os.makedirs(ckpt_path)
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    elif not FLAGS.is_retrain:  # training this model from scratch: delete the old summaries
        shutil.rmtree(summary_path)
        os.makedirs(summary_path)

    print('1.Loading data...')
    W_embedding = np.load(embedding_path)
    print('training sample_num = %d' % n_tr_batches)
    print('valid sample_num = %d' % n_va_batches)

    # Initial or restore the model
    print('2.Building model...')
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.BiGRU(W_embedding, settings)
        with tf.variable_scope('training_ops') as vs:
            learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
                                                   FLAGS.decay_rate, staircase=True)
            # two optimizer: op1, update embedding; op2, do not update embedding.
            with tf.variable_scope('Optimizer1'):
                tvars1 = tf.trainable_variables()
                grads1 = tf.gradients(model.loss, tvars1)
                optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate)
                train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
                                                   global_step=model.global_step)
            with tf.variable_scope('Optimizer2'):
                tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
                grads2 = tf.gradients(model.loss, tvars2)
                optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
                train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
                                                   global_step=model.global_step)
            update_op = tf.group(*model.update_emas)
            merged = tf.summary.merge_all()  # summary
            train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
            test_writer = tf.summary.FileWriter(summary_path + 'test')
            # Variables created under this scope; they are not restored from
            # the checkpoint and are initialised separately below.
            training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]

        # If a model has been saved before, load it and continue from there.
        if os.path.exists(ckpt_path + "checkpoint"):
            print("Restoring Variables from Checkpoint...")
            model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
            # The restored model's F1 becomes the score a new checkpoint has to beat.
            last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
            print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
            sess.run(tf.variables_initializer(training_ops))
            train_op2 = train_op1  # restored runs update the embedding from the first epoch
        else:
            print('Initializing Variables...')
            sess.run(tf.global_variables_initializer())

        print('3.Begin training...')

        train_op = train_op2
        print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
        for epoch in range(FLAGS.max_max_epoch):
            global_step = sess.run(model.global_step)
            print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
            if epoch == FLAGS.max_epoch:  # update the embedding
                train_op = train_op1
            train_fetches = [merged, model.loss, train_op, update_op]
            valid_fetches = [merged, model.loss]
            train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
        # Validate once more after the last epoch.
        valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
        print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
            sess.run(model.global_step), valid_cost, precision, recall, f1))
        if f1 > last_f1:  # save the better model
            saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
            print('saved new model to %s ' % saving_path)
        # Flush and close the event files so the last summaries are written out.
        train_writer.close()
        test_writer.close()


# Script entry point: tf.app.run() parses the command-line flags and then calls main().
if __name__ == '__main__':
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_4_han/__init__.py
================================================
# -*- coding:utf-8 -*- 



================================================
FILE: zhihu-text-classification-master/models/wd_4_han/network.py
================================================
# -*- coding:utf-8 -*-

import tensorflow as tf
from tensorflow.contrib import rnn
import tensorflow.contrib.layers as layers

"""wd_4_han
title 部分使用 bigru+attention；content 部分使用 han； 两部分输出直接 concat。
"""


class Settings(object):
    """Hyper-parameters and output locations of the wd_4_han model."""

    def __init__(self):
        name = 'wd_4_han'
        self.model_name = name
        # The title and every content sentence share one length.
        self.title_len = 30
        self.sent_len = self.title_len
        # Number of sentences per document.
        self.doc_len = 10
        # GRU state size and number of stacked bidirectional layers.
        self.hidden_size, self.n_layer = 256, 1
        # Width of the fully connected layer and number of output classes.
        self.fc_hidden_size, self.n_class = 1024, 1999
        # Per-model output directories (relative paths).
        self.summary_path = '../../summary/' + name + '/'
        self.ckpt_path = '../../ckpt/' + name + '/'


class HAN(object):
    """Title bi-GRU branch plus hierarchical-attention content branch.

    title:   inputs -> bigru + attention -> output_title
    content: inputs -> sent_encoder(bigru + attention) -> doc_encoder(bigru + attention) -> output_content
    concat[output_title, output_content] -> fc + bn + relu -> dropout -> output layer,
    trained with a sigmoid cross-entropy loss.
    """

    def __init__(self, W_embedding, settings):
        """Build the complete graph: placeholders, both branches, classifier, loss and saver.

        Args:
            W_embedding: array used to initialise the embedding variable; its
                `.shape` is the variable shape and `.shape[1]` the embedding size.
            settings: object providing model_name, sent_len, doc_len,
                hidden_size, n_layer, n_class and fc_hidden_size.
        """
        self.model_name = settings.model_name
        # The title is treated as a single sentence of sent_len tokens.
        self.title_len = self.sent_len = settings.sent_len
        self.doc_len = settings.doc_len
        self.hidden_size = settings.hidden_size
        self.n_layer = settings.n_layer
        self.n_class = settings.n_class
        self.fc_hidden_size = settings.fc_hidden_size
        self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
        # Moving-average update ops created by batchnorm(); callers must run them while training.
        self.update_emas = list()
        # placeholders
        self._tst = tf.placeholder(tf.bool)  # True -> batchnorm uses the moving averages
        self._keep_prob = tf.placeholder(tf.float32, [])
        self._batch_size = tf.placeholder(tf.int32, [])

        with tf.name_scope('Inputs'):
            self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
            # The content is fed flat: doc_len sentences of sent_len ids each.
            self._X2_inputs = tf.placeholder(tf.int64, [None, self.doc_len * self.sent_len], name='X2_inputs')
            self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')

        with tf.variable_scope('embedding'):
            # Trainable embedding matrix initialised from W_embedding.
            self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
                                             initializer=tf.constant_initializer(W_embedding), trainable=True)
        self.embedding_size = W_embedding.shape[1]

        # Title branch (the scope is named 'bigru_text').
        with tf.variable_scope('bigru_text'):
            output_title = self.bigru_inference(self._X1_inputs)

        with tf.variable_scope('han_content'):
            output_content = self.han_inference(self._X2_inputs)

        with tf.variable_scope('fc-bn-layer'):
            # Each branch yields hidden_size*2 features, hence hidden_size*4 after the concat.
            output = tf.concat([output_title, output_content], axis=1)
            W_fc = self.weight_variable([self.hidden_size * 4, self.fc_hidden_size], name='Weight_fc')
            tf.summary.histogram('W_fc', W_fc)
            h_fc = tf.matmul(output, W_fc, name='h_fc')
            # NOTE(review): name="beta_fc" is given to the constant, not to the Variable;
            # renaming the Variable would change the checkpoint keys.
            beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc"))
            tf.summary.histogram('beta_fc', beta_fc)
            fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
            self.update_emas.append(update_ema_fc)
            self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
            fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)

        with tf.variable_scope('out_layer'):
            W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
            tf.summary.histogram('Weight_out', W_out)
            b_out = self.bias_variable([self.n_class], name='bias_out')
            tf.summary.histogram('bias_out', b_out)
            self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # per-class scores (logits)

        with tf.name_scope('loss'):
            # Mean element-wise sigmoid cross-entropy between the logits and y_inputs.
            self._loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
            tf.summary.scalar('loss', self._loss)

        # Created here, so it covers only the variables that exist at this point.
        self.saver = tf.train.Saver(max_to_keep=1)

    # Read-only accessors for the placeholders and output tensors.
    @property
    def tst(self):
        return self._tst

    @property
    def keep_prob(self):
        return self._keep_prob

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def global_step(self):
        return self._global_step

    @property
    def X1_inputs(self):
        return self._X1_inputs

    @property
    def X2_inputs(self):
        return self._X2_inputs

    @property
    def y_inputs(self):
        return self._y_inputs

    @property
    def y_pred(self):
        return self._y_pred

    @property
    def loss(self):
        return self._loss

    def weight_variable(self, shape, name):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial, name=name)

    def bias_variable(self, shape, name):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial, name=name)

    def batchnorm(self, Ylogits, offset, convolutional=False):
        """Batch normalization without a learned scale.

        Args:
            Ylogits: tensor to normalise. Moments are taken over axis 0, or over
                axes [0, 1, 2] when `convolutional` is True.
            offset: beta (additive offset) applied after normalisation.
            convolutional: selects the axes used for the moments (see above).
        Returns:
            Ybn: normalised tensor with the same shape as Ylogits.
            update_moving_everages: op that updates the exponential moving
                averages of the batch mean and variance; those averages are
                used instead of the batch statistics when `self.tst` is True.
        """
        exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step)  # adding the iteration prevents from averaging across non-existing iterations
        bnepsilon = 1e-5  # small constant guarding against division by zero
        if convolutional:
            mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
        else:
            mean, variance = tf.nn.moments(Ylogits, [0])
        update_moving_everages = exp_moving_avg.apply([mean, variance])
        # Test mode -> moving averages; training mode -> statistics of the current batch.
        m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
        v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
        Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
        return Ybn, update_moving_everages

    def gru_cell(self):
        """Return a GRU cell of `hidden_size` units wrapped with output dropout (keep_prob)."""
        with tf.name_scope('gru_cell'):
            cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse)
        return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

    def bi_gru(self, inputs, seg_num):
        """Build the stacked bidirectional GRU and return its `outputs` tensor.

        Args:
            inputs: sequences to encode; the time axis holds the words of a
                sentence or the sentences of a document.
            seg_num: number of sequences, used to size the zero initial
                states. This would normally be batch_size, but the sentence
                encoder unrolls the batch_size documents into
                batch_size*doc_len sentences.
        """
        cells_fw = [self.gru_cell() for _ in range(self.n_layer)]
        cells_bw = [self.gru_cell() for _ in range(self.n_layer)]
        initial_states_fw = [cell_fw.zero_state(seg_num, tf.float32) for cell_fw in cells_fw]
        initial_states_bw = [cell_bw.zero_state(seg_num, tf.float32) for cell_bw in cells_bw]
        outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs,
                        initial_states_fw = initial_states_fw, initial_states_bw = initial_states_bw, dtype=tf.float32)
        # outputs: tensor shaped [seg_num, max_time, layers_output], where layers_output = hidden_size * 2 here.
        return outputs

    def task_specific_attention(self, inputs, output_size,
                                initializer=layers.xavier_initializer(),
                                activation_fn=tf.tanh, scope=None):
        """
        Performs task-specific attention reduction, using learned
        attention context vector (constant within task of interest).
        Args:
            inputs: Tensor of shape [batch_size, units, input_size]
                `input_size` must be static (known)
                `units` axis will be attended over (reduced from output)
                `batch_size` will be preserved
            output_size: Size of output's inner (feature) dimension
        Returns:
           outputs: Tensor of shape [batch_size, output_dim].
        """
        assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
        with tf.variable_scope(scope or 'attention') as scope:
            # u_w: the learned attention context vector
            attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size],
                                                       initializer=initializer, dtype=tf.float32)
            # Fully connected projection h_i -> u_i:
            # [batch_size, units, input_size] -> [batch_size, units, output_size]
            input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope)
            # Score of every unit against the context vector; keep_dims=True gives [batch_size, units, 1]
            vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True)
            # Normalise the scores over the `units` axis.
            attention_weights = tf.nn.softmax(vector_attn, dim=1)
            tf.summary.histogram('attention_weigths', attention_weights)
            # The weights are applied to the raw inputs, not to input_projection.
            weighted_projection = tf.multiply(inputs, attention_weights)
            outputs = tf.reduce_sum(weighted_projection, axis=1)
            return outputs  # [batch_size, hidden_size*2]

    def bigru_inference(self, X_inputs):
        """Embed the id sequence, encode it with the bi-GRU and pool it with attention."""
        inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
        output_bigru = self.bi_gru(inputs, self.batch_size)
        output_att = self.task_specific_attention(output_bigru, self.hidden_size*2)
        return output_att   # [batch_size, hidden_size*2]

    def han_inference(self, X_inputs):
        """Hierarchical attention model for the content part.
        Args:
            X_inputs: tensor.shape=(batch_size, doc_len*sent_len)
        Returns:
            doc_attn_outputs: tensor.shape=(batch_size, hidden_size(*2 for bigru))
        """
        inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)    # inputs.shape=[batch_size, doc_len*sent_len, embedding_size]
        sent_inputs = tf.reshape(inputs,[self.batch_size*self.doc_len, self.sent_len, self.embedding_size]) # [batch_size*doc_len, sent_len, embedding_size]
        with tf.variable_scope('sentence_encoder'):  # build the sentence vectors
            sent_outputs = self.bi_gru(sent_inputs, seg_num=self.batch_size*self.doc_len)
            sent_attn_outputs = self.task_specific_attention(sent_outputs, self.hidden_size*2) # [batch_size*doc_len, hidden_size*2]
            with tf.variable_scope('dropout'):
                sent_attn_outputs = tf.nn.dropout(sent_attn_outputs, self.keep_prob)
        with tf.variable_scope('doc_encoder'):      # build the document vector
            doc_inputs = tf.reshape(sent_attn_outputs, [self.batch_size, self.doc_len, self.hidden_size*2])
            doc_outputs = self.bi_gru(doc_inputs, self.batch_size)  # [batch_size, doc_len, hidden_size*2]
            doc_attn_outputs = self.task_specific_attention(doc_outputs, self.hidden_size*2) # [batch_size, hidden_size*2]
        return doc_attn_outputs    # [batch_size, hidden_size*2]



# test the model
def test():
    """Smoke test: build the HAN graph and run 100 Adam steps on all-zero batches.

    Prints the step index and the loss of every step; nothing is returned.
    """
    import numpy as np
    print('Begin testing...')
    settings = Settings()
    W_embedding = np.random.randn(50, 10)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    batch_size = 128
    with tf.Session(config=config) as sess:
        model = HAN(W_embedding, settings)
        optimizer = tf.train.AdamOptimizer(0.001)
        train_op = optimizer.minimize(model.loss)
        update_op = tf.group(*model.update_emas)
        sess.run(tf.global_variables_initializer())
        fetch = [model.loss, model.y_pred, train_op, update_op]
        # `range` works on Python 2 and 3; the former `xrange` is Python-2-only.
        for i in range(100):
            X1_batch = np.zeros((batch_size, 30), dtype=float)
            X2_batch = np.zeros((batch_size, 10 * 30), dtype=float)
            y_batch = np.zeros((batch_size, 1999), dtype=int)
            _batch_size = len(y_batch)
            feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                         model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
            loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
            print(i, loss)

# Run the smoke test when this file is executed directly.
if __name__ == '__main__':
    test()


================================================
FILE: zhihu-text-classification-master/models/wd_4_han/predict.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import time
import network

sys.path.append('../..')
from evaluator import score_eval

# Model hyper-parameters and checkpoint location come from the network module.
settings = network.Settings()
title_len, model_name, ckpt_path = settings.title_len, settings.model_name, settings.ckpt_path

# Output directories for the score matrices; created at import time if missing.
local_scores_path = '../../local_scores/'
scores_path = '../../scores/'
for _scores_dir in (local_scores_path, scores_path):
    if not os.path.exists(_scores_dir):
        os.makedirs(_scores_dir)

embedding_path = '../../data/word_embedding.npy'
data_valid_path = '../../data/wd-data/seg_valid/'
data_test_path = '../../data/wd-data/seg_test/'
# File names of the batches; the number of files is used as the number of batches.
va_batches, te_batches = os.listdir(data_valid_path), os.listdir(data_test_path)
n_va_batches, n_te_batches = len(va_batches), len(te_batches)


def get_batch(batch_id):
    """Load one validation batch and split its inputs into title and content.

    Reads ``data_valid_path + str(batch_id) + '.npz'`` and takes the arrays
    stored under the keys 'X' and 'y'.

    Args:
        batch_id: number used to build the batch file name.
    Returns:
        [X1_batch, X2_batch, y_batch], where X1_batch holds the first
        ``title_len`` columns of X and X2_batch the remaining columns.
    """
    # np.load returns an NpzFile for .npz archives; the with-block closes the
    # underlying file as soon as both arrays have been read.
    with np.load(data_valid_path + str(batch_id) + '.npz') as new_batch:
        X_batch = new_batch['X']
        y_batch = new_batch['y']
    X1_batch = X_batch[:, :title_len]
    X2_batch = X_batch[:, title_len:]
    return [X1_batch, X2_batch, y_batch]


def get_test_batch(batch_id):
    """Load one test batch (inputs only) and split it into title and content.

    Args:
        batch_id: number used to build the file name
            ``data_test_path + str(batch_id) + '.npy'``.
    Returns:
        [X1_batch, X2_batch]: the first ``title_len`` columns and the rest.
    """
    batch_file = data_test_path + str(batch_id) + '.npy'
    X_batch = np.load(batch_file)
    return [X_batch[:, :title_len], X_batch[:, title_len:]]


def local_predict(sess, model):
    """Score the validation set, print precision/recall/F1 and save the scores.

    Every validation batch is run through ``model.y_pred`` in test mode
    (``tst=True``, ``keep_prob=1.0``). The five highest-scoring class indices
    of each sample are paired with its marked labels and passed to
    ``score_eval``. The raw scores of all batches are stacked row-wise and
    written to ``local_scores_path + model_name + '.npy'``.

    Args:
        sess: session used to run the model.
        model: object exposing X1_inputs, X2_inputs, batch_size, tst,
            keep_prob and y_pred.
    """
    time0 = time.time()
    predict_labels_list = list()  # top-5 predictions of every sample
    marked_labels_list = list()
    predict_scores = list()
    # `range` instead of the Python-2-only `xrange`.
    for i in tqdm(range(n_va_batches)):
        [X1_batch, X2_batch, y_batch] = get_batch(i)
        marked_labels_list.extend(y_batch)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        batch_scores = sess.run(fetches, feed_dict)[0]
        predict_scores.append(batch_scores)
        # indices of the 5 largest scores, best first
        predict_labels_list.extend([label.argsort()[-1:-6:-1] for label in batch_scores])
    # Materialise the pairs so score_eval gets a real list on Python 3 as well.
    predict_label_and_marked_label_list = list(zip(predict_labels_list, marked_labels_list))
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
    # Stack the per-batch arrays directly: wrapping the list in np.asarray first
    # fails on recent NumPy when the last batch has fewer rows than the others.
    predict_scores = np.vstack(predict_scores)
    local_scores_name = local_scores_path + model_name + '.npy'
    np.save(local_scores_name, predict_scores)
    print('local_scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0))


def predict(sess, model):
    """Score the test set and save the stacked scores.

    Every test batch is run through ``model.y_pred`` in test mode
    (``tst=True``, ``keep_prob=1.0``); the per-batch score arrays are stacked
    row-wise and written to ``scores_path + model_name + '.npy'``.

    Args:
        sess: session used to run the model.
        model: object exposing X1_inputs, X2_inputs, batch_size, tst,
            keep_prob and y_pred.
    """
    time0 = time.time()
    predict_scores = list()
    # `range` instead of the Python-2-only `xrange`.
    for i in tqdm(range(n_te_batches)):
        [X1_batch, X2_batch] = get_test_batch(i)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
    # Stack the per-batch arrays directly: wrapping the list in np.asarray first
    # fails on recent NumPy when the last batch has fewer rows than the others.
    predict_scores = np.vstack(predict_scores)
    scores_name = scores_path + model_name + '.npy'
    np.save(scores_name, predict_scores)
    print('scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0))


def main(_):
    """Restore the latest HAN checkpoint, then score the validation and test data.

    Exits early with a message when ``ckpt_path`` holds no 'checkpoint' file.
    """
    has_checkpoint = os.path.exists(ckpt_path + 'checkpoint')
    if not has_checkpoint:
        print('there is not saved model, please check the ckpt path')
        exit()
    print('Loading model...')
    W_embedding = np.load(embedding_path)
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    with tf.Session(config=session_config) as sess:
        model = network.HAN(W_embedding, settings)
        latest_ckpt = tf.train.latest_checkpoint(ckpt_path)
        model.saver.restore(sess, latest_ckpt)
        print('Local predicting...')
        local_predict(sess, model)
        print('Test predicting...')
        predict(sess, model)


# Script entry point: hand control to tf.app.run() when executed directly.
if __name__ == '__main__':
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_4_han/train.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import shutil
import time
import network

sys.path.append('../..')
from data_helpers import to_categorical
from evaluator import score_eval

# Command-line flags of the training script.
flags = tf.flags
flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary')
flags.DEFINE_integer('max_epoch', 2, 'update the embedding after max_epoch, default: 2')
flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6')
flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4')
flags.DEFINE_float('decay_rate', 0.85, 'decay rate, default: 0.85')
flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
# Settings for real training runs
flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
# The help text states the actual default (0.38); it previously claimed 0.40.
flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.38')

# Settings for quick test runs
# flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
# flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
# flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
FLAGS = flags.FLAGS

# Module-level state shared by the functions below.
lr = FLAGS.lr
last_f1 = FLAGS.last_f1  # best validation F1 so far; updated while training
settings = network.Settings()
title_len = settings.title_len
summary_path = settings.summary_path
ckpt_path = settings.ckpt_path
model_path = ckpt_path + 'model.ckpt'

embedding_path = '../../data/word_embedding.npy'
data_train_path = '../../data/wd-data/seg_train/'
data_valid_path = '../../data/wd-data/seg_valid/'
tr_batches = os.listdir(data_train_path)  # list of batch file names
va_batches = os.listdir(data_valid_path)
n_tr_batches = len(tr_batches)
n_va_batches = len(va_batches)

# Settings for quick test runs
# n_tr_batches = 1000
# n_va_batches = 50


def get_batch(data_path, batch_id):
    """Load one batch from ``data_path`` and split its inputs into title and content.

    Reads ``data_path + str(batch_id) + '.npz'`` and takes the arrays stored
    under the keys 'X' and 'y'.

    Args:
        data_path: directory prefix of the batch files (used as a plain string
            prefix, so it has to end with a path separator).
        batch_id: number used to build the batch file name.
    Returns:
        [X1_batch, X2_batch, y_batch], where X1_batch holds the first
        ``title_len`` columns of X and X2_batch the remaining columns.
    """
    # np.load returns an NpzFile for .npz archives; the with-block closes the
    # underlying file as soon as both arrays have been read.
    with np.load(data_path + str(batch_id) + '.npz') as new_batch:
        X_batch = new_batch['X']
        y_batch = new_batch['y']
    X1_batch = X_batch[:, :title_len]
    X2_batch = X_batch[:, title_len:]
    return [X1_batch, X2_batch, y_batch]


def valid_epoch(data_path, sess, model):
    """Evaluate the model on every batch found in ``data_path``.

    The number of batches is the number of entries in ``data_path``. Each
    batch is run in test mode (``tst=True``, ``keep_prob=1.0``); the five
    highest-scoring class indices of every sample are paired with its marked
    labels and handed to ``score_eval``.

    Args:
        data_path: directory prefix of the validation batch files.
        sess: session used to run the model.
        model: object exposing the input placeholders, loss and y_pred.
    Returns:
        (mean_cost, precision, recall, f1), mean_cost being the loss averaged
        over the batches.
    """
    batch_count = len(os.listdir(data_path))
    total_cost = 0.0
    top5_per_sample = []   # predicted label indices of every sample
    marked_per_sample = []
    for batch_id in range(batch_count):
        X1_batch, X2_batch, y_batch = get_batch(data_path, batch_id)
        marked_per_sample.extend(y_batch)
        y_onehot = to_categorical(y_batch)
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_onehot,
                     model.batch_size: len(y_onehot), model.tst: True, model.keep_prob: 1.0}
        batch_cost, batch_scores = sess.run([model.loss, model.y_pred], feed_dict)
        total_cost += batch_cost
        for sample_scores in batch_scores:
            # indices of the 5 largest scores, best first
            top5_per_sample.append(sample_scores.argsort()[-1:-6:-1])
    precision, recall, f1 = score_eval(zip(top5_per_sample, marked_per_sample))
    return total_cost / batch_count, precision, recall, f1


def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
    """Run one pass over the training batches in random order.

    Before a training step, whenever ``global_step + 1`` is a multiple of
    ``FLAGS.valid_step``, the whole validation set is evaluated and the model
    is saved if its F1 beats the module-level ``last_f1`` (which is then
    updated). Whenever ``global_step + 1`` is a multiple of 500, the training
    summary is written and one randomly chosen validation batch is run to
    write a validation summary.

    Args:
        data_path: directory prefix the training batches are read from. The
            number of batches still comes from the module-level
            ``n_tr_batches``.
        sess: session used to run the model.
        model: the network object (placeholders, global_step, saver).
        train_fetches: fetch list for a training step; four values are
            unpacked, the first being the summary and the second the cost.
        valid_fetches: fetch list for a validation step; two values are
            unpacked, the summary and the cost.
        train_writer: summary writer for training summaries.
        test_writer: summary writer for validation summaries.
    """
    global last_f1
    time0 = time.time()
    batch_indexs = np.random.permutation(n_tr_batches)  # shuffle the training data
    for batch in tqdm(range(n_tr_batches)):
        global_step = sess.run(model.global_step)
        if 0 == (global_step + 1) % FLAGS.valid_step:
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
                global_step, valid_cost, precision, recall, f1, time.time() - time0))
            time0 = time.time()
            if f1 > last_f1:
                last_f1 = f1
                saving_path = model.saver.save(sess, model_path, global_step+1)
                print('saved new model to %s ' % saving_path)
        # training
        batch_id = batch_indexs[batch]
        # Honour the data_path argument; it used to be ignored in favour of the
        # module-level data_train_path (the only value callers pass today).
        [X1_batch, X2_batch, y_batch] = get_batch(data_path, batch_id)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
        summary, _cost, _, _ = sess.run(train_fetches, feed_dict)  # the cost is the mean cost of one batch
        # valid per 500 steps
        if 0 == (global_step + 1) % 500:
            train_writer.add_summary(summary, global_step)
            batch_id = np.random.randint(0, n_va_batches)  # pick one validation batch at random
            [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
            y_batch = to_categorical(y_batch)
            _batch_size = len(y_batch)
            feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                         model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
            summary, _cost = sess.run(valid_fetches, feed_dict)
            test_writer.add_summary(summary, global_step)


def main(_):
    """Build the HAN model and train it for ``FLAGS.max_max_epoch`` epochs.

    Two Adam training ops are built: one over all trainable variables and one
    that leaves out every variable whose name contains 'embedding'. Training
    starts with the latter and switches to the former at epoch
    ``FLAGS.max_epoch``. When a checkpoint exists in ``ckpt_path`` the model
    is restored from it, its validation F1 becomes the new ``last_f1``, and
    the embedding is trained from the first epoch on. After the last epoch
    the model is validated once more and saved if it beats ``last_f1``.
    """
    global last_f1
    if not os.path.exists(ckpt_path):
        os.makedirs(ckpt_path)
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    elif not FLAGS.is_retrain:  # training from scratch: drop the earlier summaries
        shutil.rmtree(summary_path)
        os.makedirs(summary_path)

    print('1.Loading data...')
    W_embedding = np.load(embedding_path)
    print('training sample_num = %d' % n_tr_batches)
    print('valid sample_num = %d' % n_va_batches)

    # Initial or restore the model
    print('2.Building model...')
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.HAN(W_embedding, settings)
        with tf.variable_scope('training_ops') as vs:
            learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
                                                   FLAGS.decay_rate, staircase=True)
            # two optimizer: op1, update embedding; op2, do not update embedding.
            with tf.variable_scope('Optimizer1'):
                tvars1 = tf.trainable_variables()
                grads1 = tf.gradients(model.loss, tvars1)
                optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate)
                train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
                                                   global_step=model.global_step)
            with tf.variable_scope('Optimizer2'):
                tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
                grads2 = tf.gradients(model.loss, tvars2)
                optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
                train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
                                                   global_step=model.global_step)
            update_op = tf.group(*model.update_emas)
            merged = tf.summary.merge_all()  # summary
            train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
            test_writer = tf.summary.FileWriter(summary_path + 'test')
            # variables created inside this scope (the optimizers' own state)
            training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]

        try:
            # If a model was saved before, continue from the latest checkpoint.
            if os.path.exists(ckpt_path + "checkpoint"):
                print("Restoring Variables from Checkpoint...")
                model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
                last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
                print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
                # the training-op variables are initialised here instead of being restored
                sess.run(tf.variables_initializer(training_ops))
                train_op2 = train_op1
            else:
                print('Initializing Variables...')
                sess.run(tf.global_variables_initializer())

            print('3.Begin training...')
            print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
            train_op = train_op2
            for epoch in range(FLAGS.max_max_epoch):
                global_step = sess.run(model.global_step)
                print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
                if epoch == FLAGS.max_epoch:  # update the embedding
                    train_op = train_op1
                train_fetches = [merged, model.loss, train_op, update_op]
                valid_fetches = [merged, model.loss]
                train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
            # validate one last time
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
                sess.run(model.global_step), valid_cost, precision, recall, f1))
            if f1 > last_f1:  # save the better model
                saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
                print('saved new model to %s ' % saving_path)
        finally:
            # Close the summary writers so buffered events reach disk even when
            # training stops with an error.
            train_writer.close()
            test_writer.close()


# Script entry point: hand control to tf.app.run() when executed directly.
if __name__ == '__main__':
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_5_bigru_cnn/__init__.py
================================================
# -*- coding:utf-8 -*- 



================================================
FILE: zhihu-text-classification-master/models/wd_5_bigru_cnn/network.py
================================================
# -*- coding:utf-8 -*-

import tensorflow as tf
from tensorflow.contrib import rnn
import tensorflow.contrib.layers as layers

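"""wd_5_bigru_cnn
The two parts use separate embeddings: RNN and CNN are structurally very different, so sharing one embedding lowers performance.
The title part uses bigru+attention; the content part uses textcnn; the two outputs are concatenated directly.
"""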


class Settings(object):
    """Hyper-parameters and output locations of the wd_5_bigru_cnn model."""

    def __init__(self):
        name = 'wd_5_bigru_cnn'
        self.model_name = name
        # widths of the title and content id inputs
        self.title_len, self.content_len = 30, 150
        # bi-GRU branch
        self.hidden_size, self.n_layer = 256, 1
        # text-CNN branch: one convolution per filter size, n_filter filters each
        self.filter_sizes = [2, 3, 4, 5, 7]
        self.n_filter = 256
        # classifier head
        self.fc_hidden_size = 1024
        self.n_class = 1999
        # per-model directories for summaries and checkpoints
        self.summary_path = '../../summary/' + name + '/'
        self.ckpt_path = '../../ckpt/' + name + '/'


class BiGRU_CNN(object):
    """Two-branch text classifier over a title input and a content input.

    title: inputs->bigru+attention->output_title
    content: inputs->textcnn->output_content
    concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
    """

    def __init__(self, W_embedding, settings):
        """Build the complete graph: placeholders, both branches, classifier and loss.

        Args:
            W_embedding: array that initialises both embedding tables; its
                second dimension is taken as the embedding size.
            settings: object providing model_name, title_len, content_len,
                hidden_size, n_layer, filter_sizes, n_filter, n_class and
                fc_hidden_size.
        """
        self.model_name = settings.model_name
        self.title_len = settings.title_len
        self.content_len = settings.content_len
        self.hidden_size = settings.hidden_size
        self.n_layer = settings.n_layer
        self.filter_sizes = settings.filter_sizes
        self.n_filter = settings.n_filter
        # width of the concatenated max-pooled outputs of all filter sizes
        self.n_filter_total = self.n_filter * len(self.filter_sizes)
        self.n_class = settings.n_class
        self.fc_hidden_size = settings.fc_hidden_size
        self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
        # moving-average update ops collected from every batchnorm call
        self.update_emas = list()
        # placeholders
        self._tst = tf.placeholder(tf.bool)  # selects moving averages instead of batch statistics in batchnorm
        self._keep_prob = tf.placeholder(tf.float32, [])
        self._batch_size = tf.placeholder(tf.int32, [])

        with tf.name_scope('Inputs'):
            self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
            self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs')
            self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')

        # Separate, trainable embedding tables for the two branches, both
        # initialised from W_embedding.
        with tf.variable_scope('embedding'):
            self.title_embedding = tf.get_variable(name='title_embedding', shape=W_embedding.shape,
                                             initializer=tf.constant_initializer(W_embedding), trainable=True)
            self.content_embedding = tf.get_variable(name='content_embedding', shape=W_embedding.shape,
                                             initializer=tf.constant_initializer(W_embedding), trainable=True)
        self.embedding_size = W_embedding.shape[1]

        with tf.variable_scope('bigru_text'):
            output_title = self.bigru_inference(self._X1_inputs)

        with tf.variable_scope('cnn_content'):
            output_content = self.cnn_inference(self._X2_inputs, self.content_len)

        with tf.variable_scope('fc-bn-layer'):
            output = tf.concat([output_title, output_content], axis=1)
            W_fc = self.weight_variable([self.hidden_size*2 + self.n_filter_total, self.fc_hidden_size], name='Weight_fc')
            tf.summary.histogram('W_fc', W_fc)
            h_fc = tf.matmul(output, W_fc, name='h_fc')
            # NOTE(review): name="beta_fc" is an argument of tf.constant here, not of
            # tf.Variable, so the variable itself is created without an explicit name.
            # Renaming it would presumably affect existing checkpoints - confirm first.
            beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc"))
            tf.summary.histogram('beta_fc', beta_fc)
            fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
            self.update_emas.append(update_ema_fc)
            self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
            fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)

        with tf.variable_scope('out_layer'):
            W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
            tf.summary.histogram('Weight_out', W_out)
            b_out = self.bias_variable([self.n_class], name='bias_out')
            tf.summary.histogram('bias_out', b_out)
            self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # score (logit) of every class

        with tf.name_scope('loss'):
            self._loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
            tf.summary.scalar('loss', self._loss)

        self.saver = tf.train.Saver(max_to_keep=1)

    @property
    def tst(self):
        return self._tst

    @property
    def keep_prob(self):
        return self._keep_prob

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def global_step(self):
        return self._global_step

    @property
    def X1_inputs(self):
        return self._X1_inputs

    @property
    def X2_inputs(self):
        return self._X2_inputs

    @property
    def y_inputs(self):
        return self._y_inputs

    @property
    def y_pred(self):
        return self._y_pred

    @property
    def loss(self):
        return self._loss

    def weight_variable(self, shape, name):
        """Create a weight variable initialised from a truncated normal (stddev 0.1)."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial, name=name)

    def bias_variable(self, shape, name):
        """Create a bias variable initialised to the constant 0.1."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial, name=name)

    def batchnorm(self, Ylogits, offset, convolutional=False):
        """Batch normalization.

        Args:
            Ylogits: pre-activation tensor to normalise. Moments are taken over
                axis 0, or over axes [0, 1, 2] when ``convolutional`` is True.
            offset: beta, the shift added after normalisation (the callers in
                this class initialise it to 0.1 for the ReLU that follows).
            convolutional: whether Ylogits is a convolution output.

        No scale (gamma) is applied: None is passed to
        tf.nn.batch_normalization. The mean m and variance v used for the
        normalisation are the batch statistics when ``self.tst`` is False and
        their exponential moving averages when it is True. bnepsilon is a
        small constant that prevents division by zero.

        Returns:
            Ybn: the normalised tensor, same shape as Ylogits.
            update_moving_everages: op that updates the moving averages of
                mean and variance, which are the statistics used in test mode.
        """
        exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step)  # adding the iteration prevents from averaging across non-existing iterations
        bnepsilon = 1e-5
        if convolutional:
            mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
        else:
            mean, variance = tf.nn.moments(Ylogits, [0])
        update_moving_everages = exp_moving_avg.apply([mean, variance])
        m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
        v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
        Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
        return Ybn, update_moving_everages

    def gru_cell(self):
        """Create one GRU cell of ``hidden_size`` units wrapped with output dropout."""
        with tf.name_scope('gru_cell'):
            cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse)
        return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

    def bi_gru(self, inputs):
        """Build the stacked bi-GRU network and return its outputs.

        ``n_layer`` forward and ``n_layer`` backward cells are created, each
        starting from a zero state sized by the ``batch_size`` placeholder.
        """
        cells_fw = [self.gru_cell() for _ in range(self.n_layer)]
        cells_bw = [self.gru_cell() for _ in range(self.n_layer)]
        initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw]
        initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw]
        outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs,
                                                            initial_states_fw=initial_states_fw,
                                                            initial_states_bw=initial_states_bw, dtype=tf.float32)
        return outputs

    def task_specific_attention(self, inputs, output_size,
                                initializer=layers.xavier_initializer(),
                                activation_fn=tf.tanh, scope=None):
        """
        Performs task-specific attention reduction, using learned
        attention context vector (constant within task of interest).
        Args:
            inputs: Tensor of shape [batch_size, units, input_size]
                `input_size` must be static (known)
                `units` axis will be attended over (reduced from output)
                `batch_size` will be preserved
            output_size: Size of the projection used to score each unit
            initializer: initializer of the attention context vector
            activation_fn: activation of the projection layer
            scope: optional variable scope name (defaults to 'attention')
        Returns:
           outputs: attention-weighted sum of `inputs` over the `units` axis,
               i.e. a Tensor of shape [batch_size, input_size].
        """
        assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
        with tf.variable_scope(scope or 'attention') as scope:
            # u_w, the attention context vector
            attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size],
                                                       initializer=initializer, dtype=tf.float32)
            # fully connected layer turning h_i into u_i, shape= [batch_size, units, input_size] -> [batch_size, units, output_size]
            input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope)
            # the reduced axis is kept, so this is [batch_size, units, 1]
            vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True)
            # normalise the scores over the units axis
            attention_weights = tf.nn.softmax(vector_attn, dim=1)
            tf.summary.histogram('attention_weigths', attention_weights)
            # the weights are applied to the raw inputs, not to the projection
            weighted_projection = tf.multiply(inputs, attention_weights)
            outputs = tf.reduce_sum(weighted_projection, axis=1)
            return outputs  # [batch_size, input_size]; hidden_size*2 for the caller in this class

    def bigru_inference(self, X_inputs):
        """Title branch: embedding lookup -> bi-GRU -> attention pooling."""
        inputs = tf.nn.embedding_lookup(self.title_embedding, X_inputs)
        output_bigru = self.bi_gru(inputs)
        output_att = self.task_specific_attention(output_bigru, self.hidden_size*2)
        return output_att

    def cnn_inference(self, X_inputs, n_step):
        """TextCNN model (content branch).

        For every filter size a convolution spanning the full embedding width
        is applied, followed by batch norm, ReLU and max pooling over all
        positions; the pooled outputs are concatenated.

        Args:
            X_inputs: tensor.shape=(batch_size, n_step)
            n_step: length of the input sequences, used to size the pooling window
        Returns:
            h_pool_flat: tensor.shape=(batch_size, self.n_filter_total)
        """
        inputs = tf.nn.embedding_lookup(self.content_embedding, X_inputs)
        # add a trailing channel axis for conv2d
        inputs = tf.expand_dims(inputs, -1)
        pooled_outputs = list()
        for i, filter_size in enumerate(self.filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, self.embedding_size, 1, self.n_filter]
                W_filter = self.weight_variable(shape=filter_shape, name='W_filter')
                beta = self.bias_variable(shape=[self.n_filter], name='beta_filter')
                tf.summary.histogram('beta', beta)
                conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True)
                # Apply nonlinearity, batch norm scaling is not useful with relus
                h = tf.nn.relu(conv_bn, name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1],
                                        strides=[1, 1, 1, 1], padding='VALID', name="pool")
                pooled_outputs.append(pooled)
                self.update_emas.append(update_ema)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total])
        return h_pool_flat  # shape = [batch_size, self.n_filter_total]


# test the model
def test():
    """Smoke test: build the BiGRU_CNN graph and run 100 Adam steps on all-zero batches.

    Prints the step index and the loss of every step; nothing is returned.
    """
    import numpy as np
    print('Begin testing...')
    settings = Settings()
    W_embedding = np.random.randn(50, 10)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    batch_size = 128
    with tf.Session(config=config) as sess:
        model = BiGRU_CNN(W_embedding, settings)
        optimizer = tf.train.AdamOptimizer(0.001)
        train_op = optimizer.minimize(model.loss)
        update_op = tf.group(*model.update_emas)
        sess.run(tf.global_variables_initializer())
        fetch = [model.loss, model.y_pred, train_op, update_op]
        # `range` works on Python 2 and 3; the former `xrange` is Python-2-only.
        for i in range(100):
            X1_batch = np.zeros((batch_size, 30), dtype=float)
            X2_batch = np.zeros((batch_size, 150), dtype=float)
            y_batch = np.zeros((batch_size, 1999), dtype=int)
            _batch_size = len(y_batch)
            feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                         model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
            loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
            print(i, loss)

# Run the smoke test when this file is executed directly.
if __name__ == '__main__':
    test()


================================================
FILE: zhihu-text-classification-master/models/wd_5_bigru_cnn/predict.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import time
import network

sys.path.append('../..')
from evaluator import score_eval

# Model hyper-parameters and checkpoint location come from the network module.
settings = network.Settings()
title_len, model_name, ckpt_path = settings.title_len, settings.model_name, settings.ckpt_path

# Output directories for the score matrices; created at import time if missing.
local_scores_path = '../../local_scores/'
scores_path = '../../scores/'
for _scores_dir in (local_scores_path, scores_path):
    if not os.path.exists(_scores_dir):
        os.makedirs(_scores_dir)

embedding_path = '../../data/word_embedding.npy'
data_valid_path = '../../data/wd-data/data_valid/'
data_test_path = '../../data/wd-data/data_test/'
# File names of the batches; the number of files is used as the number of batches.
va_batches, te_batches = os.listdir(data_valid_path), os.listdir(data_test_path)
n_va_batches, n_te_batches = len(va_batches), len(te_batches)


def get_batch(batch_id):
    """Load one validation batch and split its inputs into title and content.

    Reads ``data_valid_path + str(batch_id) + '.npz'`` and takes the arrays
    stored under the keys 'X' and 'y'.

    Args:
        batch_id: number used to build the batch file name.
    Returns:
        [X1_batch, X2_batch, y_batch], where X1_batch holds the first
        ``title_len`` columns of X and X2_batch the remaining columns.
    """
    # np.load returns an NpzFile for .npz archives; the with-block closes the
    # underlying file as soon as both arrays have been read.
    with np.load(data_valid_path + str(batch_id) + '.npz') as new_batch:
        X_batch = new_batch['X']
        y_batch = new_batch['y']
    X1_batch = X_batch[:, :title_len]
    X2_batch = X_batch[:, title_len:]
    return [X1_batch, X2_batch, y_batch]


def get_test_batch(batch_id):
    """Load one test batch (inputs only) and split it into title and content.

    Args:
        batch_id: number used to build the file name
            ``data_test_path + str(batch_id) + '.npy'``.
    Returns:
        [X1_batch, X2_batch]: the first ``title_len`` columns and the rest.
    """
    batch_file = data_test_path + str(batch_id) + '.npy'
    X_batch = np.load(batch_file)
    return [X_batch[:, :title_len], X_batch[:, title_len:]]


def local_predict(sess, model):
    """Score the validation set, print precision/recall/F1 and save the scores.

    Every validation batch is run through ``model.y_pred`` in test mode
    (``tst=True``, ``keep_prob=1.0``). The five highest-scoring class indices
    of each sample are paired with its marked labels and passed to
    ``score_eval``. The raw scores of all batches are stacked row-wise and
    written to ``local_scores_path + model_name + '.npy'``.

    Args:
        sess: session used to run the model.
        model: object exposing X1_inputs, X2_inputs, batch_size, tst,
            keep_prob and y_pred.
    """
    time0 = time.time()
    predict_labels_list = list()  # top-5 predictions of every sample
    marked_labels_list = list()
    predict_scores = list()
    # `range` instead of the Python-2-only `xrange`.
    for i in tqdm(range(n_va_batches)):
        [X1_batch, X2_batch, y_batch] = get_batch(i)
        marked_labels_list.extend(y_batch)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        batch_scores = sess.run(fetches, feed_dict)[0]
        predict_scores.append(batch_scores)
        # indices of the 5 largest scores, best first
        predict_labels_list.extend([label.argsort()[-1:-6:-1] for label in batch_scores])
    # Materialise the pairs so score_eval gets a real list on Python 3 as well.
    predict_label_and_marked_label_list = list(zip(predict_labels_list, marked_labels_list))
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
    # Stack the per-batch arrays directly: wrapping the list in np.asarray first
    # fails on recent NumPy when the last batch has fewer rows than the others.
    predict_scores = np.vstack(predict_scores)
    local_scores_name = local_scores_path + model_name + '.npy'
    np.save(local_scores_name, predict_scores)
    print('local_scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0))


def predict(sess, model):
    """Score the test set and save the stacked scores.

    Every test batch is run through ``model.y_pred`` in test mode
    (``tst=True``, ``keep_prob=1.0``); the per-batch score arrays are stacked
    row-wise and written to ``scores_path + model_name + '.npy'``.

    Args:
        sess: session used to run the model.
        model: object exposing X1_inputs, X2_inputs, batch_size, tst,
            keep_prob and y_pred.
    """
    time0 = time.time()
    predict_scores = list()
    # `range` instead of the Python-2-only `xrange`.
    for i in tqdm(range(n_te_batches)):
        [X1_batch, X2_batch] = get_test_batch(i)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_scores.append(predict_labels)
    # Stack the per-batch arrays directly: wrapping the list in np.asarray first
    # fails on recent NumPy when the last batch has fewer rows than the others.
    predict_scores = np.vstack(predict_scores)
    scores_name = scores_path + model_name + '.npy'
    np.save(scores_name, predict_scores)
    print('scores.shape=', predict_scores.shape)
    print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0))


def main(_):
    """Restore the latest checkpoint and run local and test prediction.

    Args:
        _: unused positional argument; accepted and ignored.

    Exits the process with status 1 when no ``checkpoint`` file is found
    under ``ckpt_path``.
    """
    import sys  # local import keeps this block self-contained

    if not os.path.exists(ckpt_path + 'checkpoint'):
        print('there is not saved model, please check the ckpt path')
        # Use sys.exit rather than the site-provided exit() helper, and return a
        # non-zero status so calling scripts can detect the failure.
        sys.exit(1)
    print('Loading model...')
    W_embedding = np.load(embedding_path)
    config = tf.ConfigProto()
    # Let the session grow its GPU memory allocation on demand.
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.BiGRU_CNN(W_embedding, settings)
        model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
        print('Local predicting...')
        local_predict(sess, model)
        print('Test predicting...')
        predict(sess, model)


# Script entry point: hand control to tf.app.run(), which is expected to call main() above.
if __name__ == '__main__':
    tf.app.run()


================================================
FILE: zhihu-text-classification-master/models/wd_5_bigru_cnn/train.py
================================================
# -*- coding:utf-8 -*-

from __future__ import print_function
from __future__ import division
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os
import sys
import shutil
import time
import network

sys.path.append('../..')
from data_helpers import to_categorical
from evaluator import score_eval

# Command-line flags controlling the training run.
flags = tf.flags
flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary')
flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6')
flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4')
flags.DEFINE_float('decay_rate', 0.75, 'decay rate, default: 0.75')
flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
# Full-run settings
flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
# The help text states the actual default (0.38); it previously claimed 0.40.
flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.38')

# Quick-test settings (swap with the full-run definitions above when debugging)
# flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
# flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
# flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
FLAGS = flags.FLAGS

# Module-level configuration shared by the functions below.
# Flag values are copied into plain module names.
lr = FLAGS.lr
last_f1 = FLAGS.last_f1
# Values read from the network's Settings object.
settings = network.Settings()
title_len = settings.title_len
summary_path = settings.summary_path
ckpt_path = settings.ckpt_path
# Plain string concatenation: assumes ckpt_path ends with a path separator -- TODO confirm.
model_path = ckpt_path + 'model.ckpt'

# Input locations, relative to the current working directory.
embedding_path = '../../data/word_embedding.npy'
data_train_path = '../../data/wd-data/data_train/'
data_valid_path = '../../data/wd-data/data_valid/'
tr_batches = os.listdir(data_train_path)  # list of batch file names
va_batches = os.listdir(data_valid_path)
# Batch counts are the number of directory entries; assumes each directory
# holds only batch files -- TODO confirm.
n_tr_batches = len(tr_batches)
n_va_batches = len(va_batches)

# Quick-test overrides
# n_tr_batches = 1000
# n_va_batches = 50


def get_batch(data_path, batch_id):
    """get a batch from data_path"""
    new_batch = np.load(data_path + str(batch_id) + '.npz')
    X_batch = new_batch['X']
    y_batch = new_batch['y']
    X1_batch = X_batch[:, :title_len]
    X2_batch = X_batch[:

Download .txt

gitextract_uf73lln7/

├── ReadMe.md
└── zhihu-text-classification-master/
    ├── data_process/
    │   ├── .idea/
    │   │   ├── .name
    │   │   ├── data_process.iml
    │   │   ├── deployment.xml
    │   │   ├── encodings.xml
    │   │   ├── misc.xml
    │   │   ├── modules.xml
    │   │   └── workspace.xml
    │   ├── README.md
    │   ├── char2id.py
    │   ├── creat_batch_data.py
    │   ├── creat_batch_seg.py
    │   ├── embed2ndarray.py
    │   ├── question_and_topic_2id.py
    │   ├── run_all_data_process.sh
    │   ├── test.py
    │   └── word2id.py
    └── models/
        ├── wd_1_1_cnn_concat/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        ├── wd_1_2_cnn_max/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        ├── wd_2_hcnn/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        ├── wd_3_bigru/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        ├── wd_4_han/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        ├── wd_5_bigru_cnn/
        │   ├── __init__.py
        │   ├── network.py
        │   ├── predict.py
        │   └── train.py
        └── wd_6_rcnn/
            ├── __init__.py
            ├── network.py
            ├── predict.py
            └── train.py

Download .txt

SYMBOL INDEX (224 symbols across 28 files)

FILE: zhihu-text-classification-master/data_process/char2id.py
  function get_id (line 23) | def get_id(char):
  function get_id4chars (line 33) | def get_id4chars(chars):
  function test_char2id (line 40) | def test_char2id():
  function train_char2id (line 76) | def train_char2id():

FILE: zhihu-text-classification-master/data_process/creat_batch_data.py
  function topics2ids (line 49) | def topics2ids(topics):
  function get_lables (line 56) | def get_lables():
  function wd_train_get_batch (line 71) | def wd_train_get_batch(title_len=30, content_len=150, batch_size=128):
  function wd_test_get_batch (line 109) | def wd_test_get_batch(title_len=30, content_len=150, batch_size=128):
  function ch_train_get_batch (line 124) | def ch_train_get_batch(title_len=52, content_len=300, batch_size=128):
  function ch_test_get_batch (line 159) | def ch_test_get_batch(title_len=52, content_len=300, batch_size=128):

FILE: zhihu-text-classification-master/data_process/creat_batch_seg.py
  function wd_train_get_batch (line 34) | def wd_train_get_batch(title_len=30, batch_size=128):
  function wd_test_get_batch (line 70) | def wd_test_get_batch(title_len=30, batch_size=128):
  function ch_train_get_batch (line 87) | def ch_train_get_batch(title_len=52, batch_size=128):
  function ch_test_get_batch (line 123) | def ch_test_get_batch(title_len=52, batch_size=128):

FILE: zhihu-text-classification-master/data_process/embed2ndarray.py
  function get_word_embedding (line 15) | def get_word_embedding():
  function get_char_embedding (line 43) | def get_char_embedding():

FILE: zhihu-text-classification-master/data_process/question_and_topic_2id.py
  function question_and_topic_2id (line 8) | def question_and_topic_2id():

FILE: zhihu-text-classification-master/data_process/test.py
  function func (line 7) | def func(a, b):

FILE: zhihu-text-classification-master/data_process/word2id.py
  function get_id (line 22) | def get_id(word):
  function get_id4words (line 32) | def get_id4words(words):
  function test_word2id (line 39) | def test_word2id():
  function train_word2id (line 75) | def train_word2id():

FILE: zhihu-text-classification-master/models/wd_1_1_cnn_concat/network.py
  class Settings (line 10) | class Settings(object):
    method __init__ (line 11) | def __init__(self):
  class TextCNN (line 23) | class TextCNN(object):
    method __init__ (line 30) | def __init__(self, W_embedding, settings):
    method tst (line 89) | def tst(self):
    method keep_prob (line 93) | def keep_prob(self):
    method batch_size (line 97) | def batch_size(self):
    method global_step (line 101) | def global_step(self):
    method X1_inputs (line 105) | def X1_inputs(self):
    method X2_inputs (line 109) | def X2_inputs(self):
    method y_inputs (line 113) | def y_inputs(self):
    method y_pred (line 117) | def y_pred(self):
    method loss (line 121) | def loss(self):
    method weight_variable (line 124) | def weight_variable(self, shape, name):
    method bias_variable (line 129) | def bias_variable(self, shape, name):
    method batchnorm (line 134) | def batchnorm(self, Ylogits, offset, convolutional=False):
    method cnn_inference (line 160) | def cnn_inference(self, X_inputs, n_step):

FILE: zhihu-text-classification-master/models/wd_1_1_cnn_concat/predict.py
  function get_batch (line 37) | def get_batch(batch_id):
  function get_test_batch (line 47) | def get_test_batch(batch_id):
  function local_predict (line 55) | def local_predict(sess, model):
  function predict (line 82) | def predict(sess, model):
  function main (line 101) | def main(_):

FILE: zhihu-text-classification-master/models/wd_1_1_cnn_concat/train.py
  function get_batch (line 57) | def get_batch(data_path, batch_id):
  function valid_epoch (line 67) | def valid_epoch(data_path, sess, model):
  function train_epoch (line 92) | def train_epoch(data_train_path, sess, model, train_fetches, valid_fetch...
  function main (line 129) | def main(_):

FILE: zhihu-text-classification-master/models/wd_1_2_cnn_max/network.py
  class Settings (line 10) | class Settings(object):
    method __init__ (line 11) | def __init__(self):
  class TextCNN (line 23) | class TextCNN(object):
    method __init__ (line 30) | def __init__(self, W_embedding, settings):
    method tst (line 92) | def tst(self):
    method keep_prob (line 96) | def keep_prob(self):
    method batch_size (line 100) | def batch_size(self):
    method global_step (line 104) | def global_step(self):
    method X1_inputs (line 108) | def X1_inputs(self):
    method X2_inputs (line 112) | def X2_inputs(self):
    method y_inputs (line 116) | def y_inputs(self):
    method y_pred (line 120) | def y_pred(self):
    method loss (line 124) | def loss(self):
    method weight_variable (line 127) | def weight_variable(self, shape, name):
    method bias_variable (line 132) | def bias_variable(self, shape, name):
    method batchnorm (line 137) | def batchnorm(self, Ylogits, offset, convolutional=False):
    method cnn_inference (line 163) | def cnn_inference(self, X_inputs, n_step):

FILE: zhihu-text-classification-master/models/wd_1_2_cnn_max/predict.py
  function get_batch (line 37) | def get_batch(batch_id):
  function get_test_batch (line 47) | def get_test_batch(batch_id):
  function local_predict (line 55) | def local_predict(sess, model):
  function predict (line 82) | def predict(sess, model):
  function main (line 101) | def main(_):

FILE: zhihu-text-classification-master/models/wd_1_2_cnn_max/train.py
  function get_batch (line 58) | def get_batch(data_path, batch_id):
  function valid_epoch (line 68) | def valid_epoch(data_path, sess, model):
  function train_epoch (line 93) | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, tr...
  function main (line 130) | def main(_):

FILE: zhihu-text-classification-master/models/wd_2_hcnn/network.py
  class Settings (line 10) | class Settings(object):
    method __init__ (line 11) | def __init__(self):
  class HCNN (line 24) | class HCNN(object):
    method __init__ (line 31) | def __init__(self, W_embedding, settings):
    method tst (line 91) | def tst(self):
    method keep_prob (line 95) | def keep_prob(self):
    method batch_size (line 99) | def batch_size(self):
    method global_step (line 103) | def global_step(self):
    method X1_inputs (line 107) | def X1_inputs(self):
    method X2_inputs (line 111) | def X2_inputs(self):
    method y_inputs (line 115) | def y_inputs(self):
    method y_pred (line 119) | def y_pred(self):
    method loss (line 123) | def loss(self):
    method weight_variable (line 126) | def weight_variable(self, shape, name):
    method bias_variable (line 131) | def bias_variable(self, shape, name):
    method batchnorm (line 136) | def batchnorm(self, Ylogits, offset, convolutional=False):
    method textcnn (line 162) | def textcnn(self, X_inputs, n_step, filter_sizes, embed_size):
    method cnn_inference (line 189) | def cnn_inference(self, X_inputs):
    method hcnn_inference (line 201) | def hcnn_inference(self, X_inputs):

FILE: zhihu-text-classification-master/models/wd_2_hcnn/predict.py
  function get_batch (line 37) | def get_batch(batch_id):
  function get_test_batch (line 47) | def get_test_batch(batch_id):
  function local_predict (line 55) | def local_predict(sess, model):
  function predict (line 82) | def predict(sess, model):
  function main (line 101) | def main(_):

FILE: zhihu-text-classification-master/models/wd_2_hcnn/train.py
  function get_batch (line 57) | def get_batch(data_path, batch_id):
  function valid_epoch (line 67) | def valid_epoch(data_path, sess, model):
  function train_epoch (line 92) | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, tr...
  function main (line 129) | def main(_):

FILE: zhihu-text-classification-master/models/wd_3_bigru/network.py
  class Settings (line 12) | class Settings(object):
    method __init__ (line 13) | def __init__(self):
  class BiGRU (line 25) | class BiGRU(object):
    method __init__ (line 32) | def __init__(self, W_embedding, settings):
    method tst (line 89) | def tst(self):
    method keep_prob (line 93) | def keep_prob(self):
    method batch_size (line 97) | def batch_size(self):
    method global_step (line 101) | def global_step(self):
    method X1_inputs (line 105) | def X1_inputs(self):
    method X2_inputs (line 109) | def X2_inputs(self):
    method y_inputs (line 113) | def y_inputs(self):
    method y_pred (line 117) | def y_pred(self):
    method loss (line 121) | def loss(self):
    method weight_variable (line 124) | def weight_variable(self, shape, name):
    method bias_variable (line 129) | def bias_variable(self, shape, name):
    method batchnorm (line 134) | def batchnorm(self, Ylogits, offset, convolutional=False):
    method gru_cell (line 159) | def gru_cell(self):
    method bi_gru (line 164) | def bi_gru(self, inputs):
    method task_specific_attention (line 175) | def task_specific_attention(self, inputs, output_size,
    method bigru_inference (line 205) | def bigru_inference(self, X_inputs):
  function test (line 213) | def test():

FILE: zhihu-text-classification-master/models/wd_3_bigru/predict.py
  function get_batch (line 37) | def get_batch(batch_id):
  function get_test_batch (line 47) | def get_test_batch(batch_id):
  function local_predict (line 55) | def local_predict(sess, model):
  function predict (line 82) | def predict(sess, model):
  function main (line 101) | def main(_):

FILE: zhihu-text-classification-master/models/wd_3_bigru/train.py
  function get_batch (line 57) | def get_batch(data_path, batch_id):
  function valid_epoch (line 67) | def valid_epoch(data_path, sess, model):
  function train_epoch (line 92) | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, tr...
  function main (line 129) | def main(_):

FILE: zhihu-text-classification-master/models/wd_4_han/network.py
  class Settings (line 12) | class Settings(object):
    method __init__ (line 13) | def __init__(self):
  class HAN (line 25) | class HAN(object):
    method __init__ (line 32) | def __init__(self, W_embedding, settings):
    method tst (line 90) | def tst(self):
    method keep_prob (line 94) | def keep_prob(self):
    method batch_size (line 98) | def batch_size(self):
    method global_step (line 102) | def global_step(self):
    method X1_inputs (line 106) | def X1_inputs(self):
    method X2_inputs (line 110) | def X2_inputs(self):
    method y_inputs (line 114) | def y_inputs(self):
    method y_pred (line 118) | def y_pred(self):
    method loss (line 122) | def loss(self):
    method weight_variable (line 125) | def weight_variable(self, shape, name):
    method bias_variable (line 130) | def bias_variable(self, shape, name):
    method batchnorm (line 135) | def batchnorm(self, Ylogits, offset, convolutional=False):
    method gru_cell (line 160) | def gru_cell(self):
    method bi_gru (line 165) | def bi_gru(self, inputs, seg_num):
    method task_specific_attention (line 179) | def task_specific_attention(self, inputs, output_size,
    method bigru_inference (line 209) | def bigru_inference(self, X_inputs):
    method han_inference (line 215) | def han_inference(self, X_inputs):
  function test (line 238) | def test():

FILE: zhihu-text-classification-master/models/wd_4_han/predict.py
  function get_batch (line 37) | def get_batch(batch_id):
  function get_test_batch (line 47) | def get_test_batch(batch_id):
  function local_predict (line 55) | def local_predict(sess, model):
  function predict (line 82) | def predict(sess, model):
  function main (line 101) | def main(_):

FILE: zhihu-text-classification-master/models/wd_4_han/train.py
  function get_batch (line 57) | def get_batch(data_path, batch_id):
  function valid_epoch (line 67) | def valid_epoch(data_path, sess, model):
  function train_epoch (line 92) | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, tr...
  function main (line 129) | def main(_):

FILE: zhihu-text-classification-master/models/wd_5_bigru_cnn/network.py
  class Settings (line 13) | class Settings(object):
    method __init__ (line 14) | def __init__(self):
  class BiGRU_CNN (line 28) | class BiGRU_CNN(object):
    method __init__ (line 35) | def __init__(self, W_embedding, settings):
    method tst (line 98) | def tst(self):
    method keep_prob (line 102) | def keep_prob(self):
    method batch_size (line 106) | def batch_size(self):
    method global_step (line 110) | def global_step(self):
    method X1_inputs (line 114) | def X1_inputs(self):
    method X2_inputs (line 118) | def X2_inputs(self):
    method y_inputs (line 122) | def y_inputs(self):
    method y_pred (line 126) | def y_pred(self):
    method loss (line 130) | def loss(self):
    method weight_variable (line 133) | def weight_variable(self, shape, name):
    method bias_variable (line 138) | def bias_variable(self, shape, name):
    method batchnorm (line 143) | def batchnorm(self, Ylogits, offset, convolutional=False):
    method gru_cell (line 168) | def gru_cell(self):
    method bi_gru (line 173) | def bi_gru(self, inputs):
    method task_specific_attention (line 184) | def task_specific_attention(self, inputs, output_size,
    method bigru_inference (line 214) | def bigru_inference(self, X_inputs):
    method cnn_inference (line 220) | def cnn_inference(self, X_inputs, n_step):
  function test (line 252) | def test():

FILE: zhihu-text-classification-master/models/wd_5_bigru_cnn/predict.py
  function get_batch (line 37) | def get_batch(batch_id):
  function get_test_batch (line 47) | def get_test_batch(batch_id):
  function local_predict (line 55) | def local_predict(sess, model):
  function predict (line 82) | def predict(sess, model):
  function main (line 101) | def main(_):

FILE: zhihu-text-classification-master/models/wd_5_bigru_cnn/train.py
  function get_batch (line 57) | def get_batch(data_path, batch_id):
  function valid_epoch (line 67) | def valid_epoch(data_path, sess, model):
  function train_epoch (line 92) | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, tr...
  function main (line 129) | def main(_):

FILE: zhihu-text-classification-master/models/wd_6_rcnn/network.py
  class Settings (line 15) | class Settings(object):
    method __init__ (line 16) | def __init__(self):
  class RCNN (line 30) | class RCNN(object):
    method __init__ (line 31) | def __init__(self, W_embedding, settings):
    method tst (line 93) | def tst(self):
    method keep_prob (line 97) | def keep_prob(self):
    method batch_size (line 101) | def batch_size(self):
    method global_step (line 105) | def global_step(self):
    method X1_inputs (line 109) | def X1_inputs(self):
    method X2_inputs (line 113) | def X2_inputs(self):
    method y_inputs (line 117) | def y_inputs(self):
    method y_pred (line 121) | def y_pred(self):
    method loss (line 125) | def loss(self):
    method weight_variable (line 128) | def weight_variable(self, shape, name):
    method bias_variable (line 133) | def bias_variable(self, shape, name):
    method batchnorm (line 138) | def batchnorm(self, Ylogits, offset, convolutional=False):
    method gru_cell (line 164) | def gru_cell(self):
    method bi_gru (line 169) | def bi_gru(self, X_inputs):
    method textcnn (line 185) | def textcnn(self, cnn_inputs, n_step):
    method rcnn_inference (line 210) | def rcnn_inference(self, X_inputs, n_step):
  function test (line 217) | def test():

FILE: zhihu-text-classification-master/models/wd_6_rcnn/predict.py
  function get_batch (line 37) | def get_batch(batch_id):
  function get_test_batch (line 47) | def get_test_batch(batch_id):
  function local_predict (line 55) | def local_predict(sess, model):
  function predict (line 82) | def predict(sess, model):
  function main (line 101) | def main(_):

FILE: zhihu-text-classification-master/models/wd_6_rcnn/train.py
  function get_batch (line 57) | def get_batch(data_path, batch_id):
  function valid_epoch (line 67) | def valid_epoch(data_path, sess, model):
  function train_epoch (line 92) | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, tr...
  function main (line 129) | def main(_):

Download .json

Condensed preview — 45 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (243K chars).

[
  {
    "path": "ReadMe.md",
    "chars": 74,
    "preview": "# 竞赛列表\n+ [2017 知乎看山杯机器学习挑战赛](https://www.biendata.com/competition/zhihu/)\n"
  },
  {
    "path": "zhihu-text-classification-master/data_process/.idea/.name",
    "chars": 12,
    "preview": "data_process"
  },
  {
    "path": "zhihu-text-classification-master/data_process/.idea/data_process.iml",
    "chars": 459,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<module type=\"PYTHON_MODULE\" version=\"4\">\n  <component name=\"NewModuleRootManager"
  },
  {
    "path": "zhihu-text-classification-master/data_process/.idea/deployment.xml",
    "chars": 373,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"PublishConfigData\">\n    <serverData>\n   "
  },
  {
    "path": "zhihu-text-classification-master/data_process/.idea/encodings.xml",
    "chars": 238,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"Encoding\" native2AsciiForPropertiesFiles"
  },
  {
    "path": "zhihu-text-classification-master/data_process/.idea/misc.xml",
    "chars": 687,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"ProjectLevelVcsManager\" settingsEditedMa"
  },
  {
    "path": "zhihu-text-classification-master/data_process/.idea/modules.xml",
    "chars": 276,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"ProjectModuleManager\">\n    <modules>\n   "
  },
  {
    "path": "zhihu-text-classification-master/data_process/.idea/workspace.xml",
    "chars": 23759,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"ChangeListManager\">\n    <list default=\"t"
  },
  {
    "path": "zhihu-text-classification-master/data_process/README.md",
    "chars": 1929,
    "preview": "## 数据处理\n\n1.把比赛提供的所有数据解压到 raw_data/ 目录下。<br/>\n2.按照顺序依次执行各个 .py，不带任何参数。<br/>\n  或者在当前目录下输入下面命令运行所有文件：<br/>\n  dos2unix run_a"
  },
  {
    "path": "zhihu-text-classification-master/data_process/char2id.py",
    "chars": 4371,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nimport"
  },
  {
    "path": "zhihu-text-classification-master/data_process/creat_batch_data.py",
    "chars": 5832,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nimport"
  },
  {
    "path": "zhihu-text-classification-master/data_process/creat_batch_seg.py",
    "chars": 4803,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nfrom m"
  },
  {
    "path": "zhihu-text-classification-master/data_process/embed2ndarray.py",
    "chars": 2658,
    "preview": "# -*- coding:utf-8 -*- \n\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nimpor"
  },
  {
    "path": "zhihu-text-classification-master/data_process/question_and_topic_2id.py",
    "chars": 1577,
    "preview": "# -*- coding:utf-8 -*- \n\nimport pandas as pd\nimport pickle\nfrom itertools import chain\n\n\ndef question_and_topic_2id():\n "
  },
  {
    "path": "zhihu-text-classification-master/data_process/run_all_data_process.sh",
    "chars": 521,
    "preview": "#!/usr/bin/env bash\necho -e \"\\033[44;37;5m RUNNING embed2ndarray.py\\033[0m \";\npython embed2ndarray.py;\necho -e \"\\033[44;"
  },
  {
    "path": "zhihu-text-classification-master/data_process/test.py",
    "chars": 219,
    "preview": "# -*- coding:utf-8 -*-\n\n\nfrom multiprocessing import Pool\nimport numpy as np\n\ndef func(a, b):\n    return a+b\n\np = Pool()"
  },
  {
    "path": "zhihu-text-classification-master/data_process/word2id.py",
    "chars": 4224,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport numpy as np\nimport"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_1_1_cnn_concat/__init__.py",
    "chars": 25,
    "preview": "# -*- coding:utf-8 -*- \n\n"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_1_1_cnn_concat/network.py",
    "chars": 9305,
    "preview": "# -*- coding:utf-8 -*-\n\nimport tensorflow as tf\n\n\"\"\"wd_1_1_cnn_concat\ntitle 部分使用 TextCNN；content 部分使用 TextCNN； 两部分输出直接 c"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_1_1_cnn_concat/predict.py",
    "chars": 4313,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_1_1_cnn_concat/train.py",
    "chars": 9309,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_1_2_cnn_max/__init__.py",
    "chars": 25,
    "preview": "# -*- coding:utf-8 -*- \n\n"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_1_2_cnn_max/network.py",
    "chars": 9463,
    "preview": "# -*- coding:utf-8 -*-\n\nimport tensorflow as tf\n\n\"\"\"wd_1_2_cnn_max\ntitle 部分使用 TextCNN；content 部分使用 TextCNN； 两部分输出按位取 max"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_1_2_cnn_max/predict.py",
    "chars": 4319,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_1_2_cnn_max/train.py",
    "chars": 9605,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_2_hcnn/__init__.py",
    "chars": 25,
    "preview": "# -*- coding:utf-8 -*- \n\n"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_2_hcnn/network.py",
    "chars": 11172,
    "preview": "# -*- coding:utf-8 -*-\n\nimport tensorflow as tf\n\n\"\"\"wd_2_hcnn\ntitle 部分使用 TextCNN；content 部分使用分层的 TextCNN。\n\"\"\"\n\n\nclass Se"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_2_hcnn/predict.py",
    "chars": 4314,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_2_hcnn/train.py",
    "chars": 9599,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_3_bigru/__init__.py",
    "chars": 25,
    "preview": "# -*- coding:utf-8 -*- \n\n"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_3_bigru/network.py",
    "chars": 10403,
    "preview": "# -*- coding:utf-8 -*-\n\nimport tensorflow as tf\nfrom tensorflow.contrib import rnn\nimport tensorflow.contrib.layers as l"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_3_bigru/predict.py",
    "chars": 4315,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_3_bigru/train.py",
    "chars": 9603,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_4_han/__init__.py",
    "chars": 25,
    "preview": "# -*- coding:utf-8 -*- \n\n"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_4_han/network.py",
    "chars": 12141,
    "preview": "# -*- coding:utf-8 -*-\n\nimport tensorflow as tf\nfrom tensorflow.contrib import rnn\nimport tensorflow.contrib.layers as l"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_4_han/predict.py",
    "chars": 4313,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_4_han/train.py",
    "chars": 9594,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_5_bigru_cnn/__init__.py",
    "chars": 25,
    "preview": "# -*- coding:utf-8 -*- \n\n"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_5_bigru_cnn/network.py",
    "chars": 12722,
    "preview": "# -*- coding:utf-8 -*-\n\nimport tensorflow as tf\nfrom tensorflow.contrib import rnn\nimport tensorflow.contrib.layers as l"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_5_bigru_cnn/predict.py",
    "chars": 4321,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_5_bigru_cnn/train.py",
    "chars": 9602,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_6_rcnn/__init__.py",
    "chars": 25,
    "preview": "# -*- coding:utf-8 -*- \n\n"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_6_rcnn/network.py",
    "chars": 10829,
    "preview": "# -*- coding:utf-8 -*-\n\nimport tensorflow as tf\nfrom tensorflow.contrib import rnn\nimport tensorflow.contrib.layers as l"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_6_rcnn/predict.py",
    "chars": 4316,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  },
  {
    "path": "zhihu-text-classification-master/models/wd_6_rcnn/train.py",
    "chars": 9601,
    "preview": "# -*- coding:utf-8 -*-\n\nfrom __future__ import print_function\nfrom __future__ import division\nimport tensorflow as tf\nim"
  }
]

About this extraction

This page contains the full source code of the Happy-zyy/Competition GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 45 files (220.1 KB), approximately 59.5k tokens, and a symbol index with 224 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo