Copy disabled (too large)
Download .txt
Showing preview only (18,475K chars total). Download the full file to get everything.
Repository: huanghuidmml/cail2019_track2
Branch: master
Commit: 3bf263f662c5
Files: 47
Total size: 8.6 MB
Directory structure:
gitextract_xilhx8ju/
├── .idea/
│ ├── Cail2019_track2.iml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── README.md
├── bert/
│ ├── .gitignore
│ ├── CONTRIBUTING.md
│ ├── LICENSE
│ ├── README.md
│ ├── __init__.py
│ ├── create_pretraining_data.py
│ ├── extract_features.py
│ ├── modeling.py
│ ├── modeling_test.py
│ ├── multilingual.md
│ ├── optimization.py
│ ├── optimization_test.py
│ ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb
│ ├── requirements.txt
│ ├── run_classifier.py
│ ├── run_classifier_with_tfhub.py
│ ├── run_pretraining.py
│ ├── run_squad.py
│ ├── sample_text.txt
│ ├── tokenization.py
│ └── tokenization_test.py
├── convert.py
├── createPretrainData.py
├── data/
│ ├── divorce/
│ │ ├── data_small_selected.json
│ │ ├── tags.txt
│ │ └── train_selected.json
│ ├── labor/
│ │ ├── data_small_selected.json
│ │ ├── tags.txt
│ │ └── train_selected.json
│ └── loan/
│ ├── data_small_selected.json
│ ├── tags.txt
│ └── train_selected.json
├── evaluation.py
├── genPretrainData.py
├── run_pretrain.py
├── search_threshold.py
├── train.py
└── utils/
├── __init__.py
├── ckpt2pb.py
├── evaluate.py
├── models.py
└── predict.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .idea/Cail2019_track2.iml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>
================================================
FILE: .idea/misc.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (2)" project-jdk-type="Python SDK" />
</project>
================================================
FILE: .idea/modules.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Cail2019_track2.iml" filepath="$PROJECT_DIR$/.idea/Cail2019_track2.iml" />
</modules>
</component>
</project>
================================================
FILE: .idea/workspace.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="79a6edf2-9a9b-4f4c-8230-083b8bfb579f" name="Default Changelist" comment="" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="CoverageDataManager">
<SUITE FILE_PATH="coverage/Cail2019_track2$genPretrainData.coverage" NAME="genPretrainData Coverage Results" MODIFIED="1569814645269" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/Cail2019_track2$a.coverage" NAME="a Coverage Results" MODIFIED="1569818481213" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
</component>
<component name="FUSProjectUsageTrigger">
<session id="-1749523040">
<usages-collector id="statistics.lifecycle.project">
<counts>
<entry key="project.closed" value="1" />
<entry key="project.open.time.0" value="1" />
<entry key="project.opened" value="1" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.extensions.open">
<counts>
<entry key="json" value="1" />
<entry key="py" value="39" />
<entry key="txt" value="3" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.open">
<counts>
<entry key="JSON" value="1" />
<entry key="PLAIN_TEXT" value="3" />
<entry key="Python" value="39" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.extensions.edit">
<counts>
<entry key="dummy" value="38" />
<entry key="py" value="1896" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.edit">
<counts>
<entry key="PLAIN_TEXT" value="38" />
<entry key="Python" value="1896" />
</counts>
</usages-collector>
</session>
</component>
<component name="FileEditorManager">
<leaf>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/run_pretrain.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1739">
<caret line="54" column="20" lean-forward="true" selection-start-line="54" selection-start-column="20" selection-end-line="54" selection-end-column="20" />
<folding>
<element signature="e#683#721#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/evaluation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="217">
<caret line="24" lean-forward="true" selection-start-line="24" selection-end-line="24" />
<folding>
<element signature="e#90#101#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/train.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="351">
<caret line="58" selection-start-line="58" selection-end-line="60" selection-end-column="60" />
<folding>
<element signature="e#735#773#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/utils/models.py" />
<option value="$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/train [4].py" />
<option value="$PROJECT_DIR$/utils/ckpt2pb.py" />
<option value="$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/ckpt2pb [2].py" />
<option value="$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/train [5].py" />
<option value="$PROJECT_DIR$/convert.py" />
<option value="$PROJECT_DIR$/utils/evaluate.py" />
<option value="$PROJECT_DIR$/evaluate.py" />
<option value="$PROJECT_DIR$/utils/predict.py" />
<option value="$PROJECT_DIR$/search_threshold.py" />
<option value="$PROJECT_DIR$/create_data.py" />
<option value="$PROJECT_DIR$/genPretrainData.py" />
<option value="$PROJECT_DIR$/train.py" />
<option value="$PROJECT_DIR$/run_pretrain.py" />
<option value="$PROJECT_DIR$/a.py" />
<option value="$PROJECT_DIR$/createPretrainData.py" />
<option value="$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/run_pretrain.py" />
<option value="$PROJECT_DIR$/evaluation.py" />
</list>
</option>
</component>
<component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
<component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
<component name="JsGulpfileManager">
<detection-done>true</detection-done>
<sorting>DEFINITION_ORDER</sorting>
</component>
<component name="ProjectFrameBounds" extendedState="6">
<option name="x" value="-8" />
<option name="y" value="-8" />
<option name="width" value="1936" />
<option name="height" value="1056" />
</component>
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="Cail2019_track2" type="b2602c69:ProjectViewProjectNode" />
<item name="cail2019_track2" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="Cail2019_track2" type="b2602c69:ProjectViewProjectNode" />
<item name="cail2019_track2" type="462c0819:PsiDirectoryNode" />
<item name="utils" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
</panes>
</component>
<component name="PropertiesComponent">
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="G:\法研杯\代码整理\cail2019_track2\data" />
</key>
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="RunManager" selected="Python.a">
<configuration name="a" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="Cail2019_track2" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/a.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="genPretrainData" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="Cail2019_track2" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/genPretrainData.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<list>
<item itemvalue="Python.a" />
<item itemvalue="Python.genPretrainData" />
</list>
<recent_temporary>
<list>
<item itemvalue="Python.a" />
<item itemvalue="Python.genPretrainData" />
</list>
</recent_temporary>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="79a6edf2-9a9b-4f4c-8230-083b8bfb579f" name="Default Changelist" comment="" />
<created>1569804343348</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1569804343348</updated>
</task>
<servers />
</component>
<component name="ToolWindowManager">
<frame x="-8" y="-8" width="1936" height="1056" extended-state="6" />
<editor active="true" />
<layout>
<window_info id="Favorites" side_tool="true" />
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info anchor="bottom" id="Docker" show_stripe_button="false" />
<window_info anchor="bottom" id="Database Changes" show_stripe_button="false" />
<window_info anchor="bottom" id="Version Control" show_stripe_button="false" />
<window_info anchor="bottom" id="Python Console" />
<window_info anchor="bottom" id="Terminal" />
<window_info anchor="bottom" id="Event Log" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" weight="0.329718" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="right" id="SciView" />
<window_info anchor="right" id="Database" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
</layout>
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="1" />
</component>
<component name="VcsContentAnnotationSettings">
<option name="myLimit" value="2678400000" />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/data/divorce/tags.txt">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/train [4].py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-108">
<caret line="49" column="20" lean-forward="true" selection-start-line="49" selection-start-column="20" selection-end-line="49" selection-end-column="20" />
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/train [3].py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="54">
<caret line="17" column="31" lean-forward="true" selection-start-line="17" selection-start-column="31" selection-end-line="17" selection-end-column="31" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/data/divorce/train_selected.json">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/predict [9].py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="403">
<caret line="194" column="32" lean-forward="true" selection-start-line="194" selection-start-column="32" selection-end-line="194" selection-end-column="32" />
<folding>
<element signature="e#0#23#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/ckpt2pb [2].py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="162">
<caret line="69" lean-forward="true" selection-start-line="69" selection-end-line="69" />
<folding>
<element signature="e#0#25#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/convert.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="351">
<caret line="13" column="35" lean-forward="true" selection-start-line="13" selection-start-column="35" selection-end-line="13" selection-end-column="35" />
<folding>
<element signature="e#0#33#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/train [5].py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="513">
<caret line="48" lean-forward="true" selection-start-line="48" selection-end-line="48" />
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/train [6].py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="405">
<caret line="50" column="26" selection-start-line="50" selection-start-column="19" selection-end-line="50" selection-end-column="26" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/utils/ckpt2pb.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-540">
<caret line="58" column="40" lean-forward="true" selection-start-line="58" selection-start-column="40" selection-end-line="58" selection-end-column="40" />
<folding>
<element signature="e#89#114#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/utils/models.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1404">
<caret line="85" column="60" lean-forward="true" selection-start-line="85" selection-start-column="60" selection-end-line="85" selection-end-column="60" />
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/extend_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-256">
<caret line="83" column="31" lean-forward="true" selection-start-line="83" selection-start-column="31" selection-end-line="83" selection-end-column="31" />
<folding>
<element signature="e#0#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/evaluate.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="54">
<caret line="2" column="13" lean-forward="true" selection-end-line="111" selection-end-column="20" />
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/judge.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="135">
<caret line="5" column="18" lean-forward="true" selection-start-line="5" selection-start-column="18" selection-end-line="5" selection-end-column="18" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/utils/evaluate.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="514">
<caret line="106" lean-forward="true" selection-start-line="106" selection-end-line="106" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/utils/predict.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-2348">
<caret line="94" column="22" selection-start-line="94" selection-start-column="13" selection-end-line="94" selection-end-column="22" />
<folding>
<element signature="e#89#112#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/search.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="538">
<caret line="119" column="26" lean-forward="true" selection-start-line="119" selection-start-column="26" selection-end-line="119" selection-end-column="26" />
<folding>
<element signature="e#0#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/search_threshold.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-593">
<caret line="75" column="33" lean-forward="true" selection-start-line="75" selection-start-column="33" selection-end-line="75" selection-end-column="33" />
<folding>
<element signature="e#90#125#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/data/divorce/pretrain.txt">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-486" />
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/genPretrainData.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="189">
<caret line="19" column="41" lean-forward="true" selection-start-line="19" selection-start-column="41" selection-end-line="19" selection-end-column="41" />
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/createPretrainData.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="406">
<caret line="445" column="40" lean-forward="true" selection-start-line="445" selection-start-column="40" selection-end-line="445" selection-end-column="40" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/a.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="8" column="33" selection-start-line="8" selection-start-column="6" selection-end-line="8" selection-end-column="33" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/train.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="351">
<caret line="58" selection-start-line="58" selection-end-line="60" selection-end-column="60" />
<folding>
<element signature="e#735#773#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/createPretrainData.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="379">
<caret line="445" column="44" lean-forward="true" selection-start-line="445" selection-start-column="44" selection-end-line="445" selection-end-column="44" />
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/Documents/NetSarang Computer/6/Xftp/Temporary/run_pretrain.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-108">
<caret line="24" lean-forward="true" selection-start-line="24" selection-end-line="24" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/convert.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="27">
<caret line="1" column="21" lean-forward="true" selection-start-line="1" selection-start-column="21" selection-end-line="1" selection-end-column="21" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/evaluation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="217">
<caret line="24" lean-forward="true" selection-start-line="24" selection-end-line="24" />
<folding>
<element signature="e#90#101#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/run_pretrain.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1739">
<caret line="54" column="20" lean-forward="true" selection-start-line="54" selection-start-column="20" selection-end-line="54" selection-end-column="20" />
<folding>
<element signature="e#683#721#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
</project>
================================================
FILE: README.md
================================================
# cail2019_track2
中国法研杯CAIL2019要素抽取任务第三名方案分享
====
欢迎大家使用[tensorflow1.x的bert系列模型库,支持单机多卡,梯度累积,自动导出pb部署](https://github.com/huanghuidmml/textToy)
(修改了一下readme,之前那一版感觉写的太水了。)
这次比赛和前两名差距很大,但是也在此给大家分享一下我所用的方案。
主要的trick包括领域预训练、focal loss、阈值移动、规则匹配以及模型优化、调参。
没有使用模型融合。
### **效果对比**
由于是第一次参赛,很多比赛细节没有做记录,效果对比的分数是我从凭印象在上传历史记录里边找的,可能分数不一致,但是大概就在那个范围,还请见谅。
| Model | 详情 | 线上评分 |
| :------: | :------: | :------: |
| BERT | 使用bert_base做多标签分类 | 69.553 |
| BERT+RCNN+ATT | 在BERT后增加RCNN层,并把最大池化换成Attention | 70.143 |
| BERT+RCNN+ATT | 增加阈值移动 | 70.809 |
| BERT+RCNN+ATT | 增加focal loss | 71.126 |
| BERT+RCNN+ATT | 增加规则 | 72.2 |
| BERT+RCNN+ATT | 使用比赛数据预训练BERT | 72.526 |
| BERT+RCNN+ATT | copy样本正例少的数据(divorce loan有效) | 72.909 |
| BERT+RCNN+ATT | 在比赛数据基础上增加裁判文书(1000篇)做预训练(labor有效) | 73.483 |
| BERT+RCNN+ATT | 增加否定词规则 | 73.533 |
### **主要参数**
| 参数名 | 参数值 |
| :------: | :------: |
| 预训练模型 | [BERT_Base_Chinese](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip) |
| max_length(divorce) | 128 |
| max_length(labor) | 150 |
| max_length(loan) | 200 |
| batch_size | 32 |
| learning_rate | 2e-5 |
| num_train_epochs | 30 |
| alpha(focal loss) | 0.25 |
| gamma(focal loss) | 2 |
| hidden_dim(lstm) | 200 |
**方案介绍**
------
### **任务简介**
根据给定司法文书中的相关段落,识别相应的关键案情要素,其中每个句子对应的类别标签个数不定,属于多标签问题。任务共涉及三个领域,包括婚姻家庭、劳动争议、借款合同。
例如:
| 例句 | 标签 |
| :------: | :------: |
| 高雷红提出诉讼请求:1、判令被告支付原告的工资1630×0.8×4=5216元; | ["LB2"] |
| 原告范合诉称:原告系被告处职工。 | [] |
| 5、判令被告某甲公司支付2011年9月8日至2012年8月7日未签订劳动合同的二倍工资差额16,298.37元; | ["LB9", "LB6"] |
根据数据集,我选定的方案就是传统的多标签分类方法。bert预训练模型使用的是google开源的bert_base_chinese.
### **任务难点**
* **正负例样本不均衡**
* **有的要素标签正例仅有几条,模型无法学习**
### **解决方案**
#### **focal loss**
减少易分类样本的权重,增加难分类样本的损失贡献值,参数见上表的alpha,gamma
#### **阈值移动**
将比赛的数据集切分为训练集和测试集。先用训练集去训练模型,
然后使用测试集去测试模型,筛选阈值;最后把所有数据拿去训练最后的提交模型,
预测阈值就采用之前筛选出来的阈值。
#### **copy少量数据**
数据增强我尝试过eda,但是效果不行,不如不用,后来使用copy的方法做数据增强,
将正例少的样本copy一定的数量,但是不能copy太多,否则会严重破坏分布。
而且这个方法我只在divorce和loan两种领域有提升,labor上下降了,
可能是copy量不合理,大家可以下去尝试修改一下,看下会不会提升。
#### **模型优化**
最后使用的模型是BERT + RCNN,并且RCNN部分的最大池化修改为Attention。
主要方法就是将BERT的输出向量X输入BiLstm,得到一个特征向量H,最后将X和H
拼接送入Attention。
#### **规则**
规则主要是为了修正模型无法学习的要素标签,使用的方式:首先通过
标签的解释说明和包含标签的样本确定规则,规则在python中使用的是正则
表达式;然后针对需要预测的文本,我们先使用正则表达式去匹配,若是
匹配成功,则说明文本包含该规则对应的标签;最后把规则匹配出来的标签与
模型预测的标签取并集,得到最终预测要素集。
规则举例:
> ['.(保证合同|抵押合同|借款合同).(无效|不发生效力).*']
,对应的要素是LN12。
**否定词规则**
否定词规则的意思是:在采用规则修正的时候,若是句子以一些否定词结尾,规则将不生效。
举例:
> 被告五金公司辩称本案借款合同和保证合同均无效,缺乏法律依据,本院**不予采纳**。
> 实际标签: LN13 LN10
这个句子可以匹配到我们写的LN12的规则:‘.*(保证合同|抵押合同|借款合同).*(无效|不发生效力).*‘
但是因为末尾出现了不予采纳,所以该标签规则不生效,没有LN12。
#### **领域预训练**
bert模型采用的是bert_base_chinese,如果使用徐亮大佬的roberta应该还会有提升。
司法领域属于特殊领域,所以使用比赛数据先做了一次预训练,在三种领域都有一定的提升,
后边我爬取一些裁判文书来做预训练,可能是因为数据量小和质量不够,只在labor上得到了
提升,如果保证数据量和质量,应该会有提升。
**代码说明**
-------
#### **基本代码**
CUDA_VISIBLE_DEVICES=1是指定第一块显卡,根据具体情况自己改,
如果CPU的话就不用了。
**训练**
> CUDA_VISIBLE_DEVICES=1 python train.py
**将ckpt转为pb**
> CUDA_VISIBLE_DEVICES=1 python convert.py
**线下测试**
> CUDA_VISIBLE_DEVICES=1 python evaluation.py
#### **如果需要额外预训练的话,使用以下代码**
**创建预训练数据txt**
> python genPretrainData.py
**创建预训练数据的tfrecord文件**
> python createPretrainData.py
**预训练**
> CUDA_VISIBLE_DEVICES=1 python run_pretrain.py
**Reference**
-----
1. [TensorFlow code and pre-trained models for BERT](https://github.com/google-research/bert)
2. [The implementation of focal loss proposed on "Focal Loss for Dense Object Detection" by KM He and support for multi-label dataset.](https://github.com/ailias/Focal-Loss-implement-on-Tensorflow)
**感谢**
-----
感谢队友牧笛的帮助
================================================
FILE: bert/.gitignore
================================================
# Initially taken from Github's Python gitignore file
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
================================================
FILE: bert/CONTRIBUTING.md
================================================
# How to Contribute
BERT needs to maintain permanent compatibility with the pre-trained model files,
so we do not plan to make any major changes to this library (other than what was
promised in the README). However, we can accept small patches related to
re-factoring and documentation. To submit contributes, there are just a few
small guidelines you need to follow.
## Contributor License Agreement
Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.
You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.
## Code reviews
All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.
## Community Guidelines
This project follows
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
================================================
FILE: bert/LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: bert/README.md
================================================
# BERT
**\*\*\*\*\* New February 7th, 2019: TfHub Module \*\*\*\*\***
BERT has been uploaded to [TensorFlow Hub](https://tfhub.dev). See
`run_classifier_with_tfhub.py` for an example of how to use the TF Hub module,
or run an example in the browser on [Colab](https://colab.sandbox.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb).
**\*\*\*\*\* New November 23rd, 2018: Un-normalized multilingual model + Thai +
Mongolian \*\*\*\*\***
We uploaded a new multilingual model which does *not* perform any normalization
on the input (no lower casing, accent stripping, or Unicode normalization), and
additionally inclues Thai and Mongolian.
**It is recommended to use this version for developing multilingual models,
especially on languages with non-Latin alphabets.**
This does not require any code changes, and can be downloaded here:
* **[`BERT-Base, Multilingual Cased`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
**\*\*\*\*\* New November 15th, 2018: SOTA SQuAD 2.0 System \*\*\*\*\***
We released code changes to reproduce our 83% F1 SQuAD 2.0 system, which is
currently 1st place on the leaderboard by 3%. See the SQuAD 2.0 section of the
README for details.
**\*\*\*\*\* New November 5th, 2018: Third-party PyTorch and Chainer versions of
BERT available \*\*\*\*\***
NLP researchers from HuggingFace made a
[PyTorch version of BERT available](https://github.com/huggingface/pytorch-pretrained-BERT)
which is compatible with our pre-trained checkpoints and is able to reproduce
our results. Sosuke Kobayashi also made a
[Chainer version of BERT available](https://github.com/soskek/bert-chainer)
(Thanks!) We were not involved in the creation or maintenance of the PyTorch
implementation so please direct any questions towards the authors of that
repository.
**\*\*\*\*\* New November 3rd, 2018: Multilingual and Chinese models available
\*\*\*\*\***
We have made two new BERT models available:
* **[`BERT-Base, Multilingual`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)
(Not recommended, use `Multilingual Cased` instead)**: 102 languages,
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
parameters
We use character-based tokenization for Chinese, and WordPiece tokenization for
all other languages. Both models should work out-of-the-box without any code
changes. We did update the implementation of `BasicTokenizer` in
`tokenization.py` to support Chinese character tokenization, so please update if
you forked it. However, we did not change the tokenization API.
For more, see the
[Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md).
**\*\*\*\*\* End new information \*\*\*\*\***
## Introduction
**BERT**, or **B**idirectional **E**ncoder **R**epresentations from
**T**ransformers, is a new method of pre-training language representations which
obtains state-of-the-art results on a wide array of Natural Language Processing
(NLP) tasks.
Our academic paper which describes BERT in detail and provides full results on a
number of tasks can be found here:
[https://arxiv.org/abs/1810.04805](https://arxiv.org/abs/1810.04805).
To give a few numbers, here are the results on the
[SQuAD v1.1](https://rajpurkar.github.io/SQuAD-explorer/) question answering
task:
SQuAD v1.1 Leaderboard (Oct 8th 2018) | Test EM | Test F1
------------------------------------- | :------: | :------:
1st Place Ensemble - BERT | **87.4** | **93.2**
2nd Place Ensemble - nlnet | 86.0 | 91.7
1st Place Single Model - BERT | **85.1** | **91.8**
2nd Place Single Model - nlnet | 83.5 | 90.1
And several natural language inference tasks:
System | MultiNLI | Question NLI | SWAG
----------------------- | :------: | :----------: | :------:
BERT | **86.7** | **91.1** | **86.3**
OpenAI GPT (Prev. SOTA) | 82.2 | 88.1 | 75.0
Plus many other tasks.
Moreover, these results were all obtained with almost no task-specific neural
network architecture design.
If you already know what BERT is and you just want to get started, you can
[download the pre-trained models](#pre-trained-models) and
[run a state-of-the-art fine-tuning](#fine-tuning-with-bert) in only a few
minutes.
## What is BERT?
BERT is a method of pre-training language representations, meaning that we train
a general-purpose "language understanding" model on a large text corpus (like
Wikipedia), and then use that model for downstream NLP tasks that we care about
(like question answering). BERT outperforms previous methods because it is the
first *unsupervised*, *deeply bidirectional* system for pre-training NLP.
*Unsupervised* means that BERT was trained using only a plain text corpus, which
is important because an enormous amount of plain text data is publicly available
on the web in many languages.
Pre-trained representations can also either be *context-free* or *contextual*,
and contextual representations can further be *unidirectional* or
*bidirectional*. Context-free models such as
[word2vec](https://www.tensorflow.org/tutorials/representation/word2vec) or
[GloVe](https://nlp.stanford.edu/projects/glove/) generate a single "word
embedding" representation for each word in the vocabulary, so `bank` would have
the same representation in `bank deposit` and `river bank`. Contextual models
instead generate a representation of each word that is based on the other words
in the sentence.
BERT was built upon recent work in pre-training contextual representations —
including [Semi-supervised Sequence Learning](https://arxiv.org/abs/1511.01432),
[Generative Pre-Training](https://blog.openai.com/language-unsupervised/),
[ELMo](https://allennlp.org/elmo), and
[ULMFit](http://nlp.fast.ai/classification/2018/05/15/introducting-ulmfit.html)
— but crucially these models are all *unidirectional* or *shallowly
bidirectional*. This means that each word is only contextualized using the words
to its left (or right). For example, in the sentence `I made a bank deposit` the
unidirectional representation of `bank` is only based on `I made a` but not
`deposit`. Some previous work does combine the representations from separate
left-context and right-context models, but only in a "shallow" manner. BERT
represents "bank" using both its left and right context — `I made a ... deposit`
— starting from the very bottom of a deep neural network, so it is *deeply
bidirectional*.
BERT uses a simple approach for this: We mask out 15% of the words in the input,
run the entire sequence through a deep bidirectional
[Transformer](https://arxiv.org/abs/1706.03762) encoder, and then predict only
the masked words. For example:
```
Input: the man went to the [MASK1] . he bought a [MASK2] of milk.
Labels: [MASK1] = store; [MASK2] = gallon
```
In order to learn relationships between sentences, we also train on a simple
task which can be generated from any monolingual corpus: Given two sentences `A`
and `B`, is `B` the actual next sentence that comes after `A`, or just a random
sentence from the corpus?
```
Sentence A: the man went to the store .
Sentence B: he bought a gallon of milk .
Label: IsNextSentence
```
```
Sentence A: the man went to the store .
Sentence B: penguins are flightless .
Label: NotNextSentence
```
We then train a large model (12-layer to 24-layer Transformer) on a large corpus
(Wikipedia + [BookCorpus](http://yknzhu.wixsite.com/mbweb)) for a long time (1M
update steps), and that's BERT.
Using BERT has two stages: *Pre-training* and *fine-tuning*.
**Pre-training** is fairly expensive (four days on 4 to 16 Cloud TPUs), but is a
one-time procedure for each language (current models are English-only, but
multilingual models will be released in the near future). We are releasing a
number of pre-trained models from the paper which were pre-trained at Google.
Most NLP researchers will never need to pre-train their own model from scratch.
**Fine-tuning** is inexpensive. All of the results in the paper can be
replicated in at most 1 hour on a single Cloud TPU, or a few hours on a GPU,
starting from the exact same pre-trained model. SQuAD, for example, can be
trained in around 30 minutes on a single Cloud TPU to achieve a Dev F1 score of
91.0%, which is the single system state-of-the-art.
The other important aspect of BERT is that it can be adapted to many types of
NLP tasks very easily. In the paper, we demonstrate state-of-the-art results on
sentence-level (e.g., SST-2), sentence-pair-level (e.g., MultiNLI), word-level
(e.g., NER), and span-level (e.g., SQuAD) tasks with almost no task-specific
modifications.
## What has been released in this repository?
We are releasing the following:
* TensorFlow code for the BERT model architecture (which is mostly a standard
[Transformer](https://arxiv.org/abs/1706.03762) architecture).
* Pre-trained checkpoints for both the lowercase and cased version of
`BERT-Base` and `BERT-Large` from the paper.
* TensorFlow code for push-button replication of the most important
fine-tuning experiments from the paper, including SQuAD, MultiNLI, and MRPC.
All of the code in this repository works out-of-the-box with CPU, GPU, and Cloud
TPU.
## Pre-trained models
We are releasing the `BERT-Base` and `BERT-Large` models from the paper.
`Uncased` means that the text has been lowercased before WordPiece tokenization,
e.g., `John Smith` becomes `john smith`. The `Uncased` model also strips out any
accent markers. `Cased` means that the true case and accent markers are
preserved. Typically, the `Uncased` model is better unless you know that case
information is important for your task (e.g., Named Entity Recognition or
Part-of-Speech tagging).
These models are all released under the same license as the source code (Apache
2.0).
For information about the Multilingual and Chinese model, see the
[Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md).
**When using a cased model, make sure to pass `--do_lower=False` to the training
scripts. (Or pass `do_lower_case=False` directly to `FullTokenizer` if you're
using your own script.)**
The links to the models are here (right-click, 'Save link as...' on the name):
* **[`BERT-Base, Uncased`](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Uncased`](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Cased`](https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip)**:
12-layer, 768-hidden, 12-heads , 110M parameters
* **[`BERT-Large, Cased`](https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)
(Not recommended, use `Multilingual Cased` instead)**: 102 languages,
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
parameters
Each .zip file contains three items:
* A TensorFlow checkpoint (`bert_model.ckpt`) containing the pre-trained
weights (which is actually 3 files).
* A vocab file (`vocab.txt`) to map WordPiece to word id.
* A config file (`bert_config.json`) which specifies the hyperparameters of
the model.
## Fine-tuning with BERT
**Important**: All results on the paper were fine-tuned on a single Cloud TPU,
which has 64GB of RAM. It is currently not possible to re-produce most of the
`BERT-Large` results on the paper using a GPU with 12GB - 16GB of RAM, because
the maximum batch size that can fit in memory is too small. We are working on
adding code to this repository which allows for much larger effective batch size
on the GPU. See the section on [out-of-memory issues](#out-of-memory-issues) for
more details.
This code was tested with TensorFlow 1.11.0. It was tested with Python2 and
Python3 (but more thoroughly with Python2, since this is what's used internally
in Google).
The fine-tuning examples which use `BERT-Base` should be able to run on a GPU
that has at least 12GB of RAM using the hyperparameters given.
### Fine-tuning with Cloud TPUs
Most of the examples below assumes that you will be running training/evaluation
on your local machine, using a GPU like a Titan X or GTX 1080.
However, if you have access to a Cloud TPU that you want to train on, just add
the following flags to `run_classifier.py` or `run_squad.py`:
```
--use_tpu=True \
--tpu_name=$TPU_NAME
```
Please see the
[Google Cloud TPU tutorial](https://cloud.google.com/tpu/docs/tutorials/mnist)
for how to use Cloud TPUs. Alternatively, you can use the Google Colab notebook
"[BERT FineTuning with Cloud TPUs](https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)".
On Cloud TPUs, the pretrained model and the output directory will need to be on
Google Cloud Storage. For example, if you have a bucket named `some_bucket`, you
might use the following flags instead:
```
--output_dir=gs://some_bucket/my_output_dir/
```
The unzipped pre-trained model files can also be found in the Google Cloud
Storage folder `gs://bert_models/2018_10_18`. For example:
```
export BERT_BASE_DIR=gs://bert_models/2018_10_18/uncased_L-12_H-768_A-12
```
### Sentence (and sentence-pair) classification tasks
Before running this example you must download the
[GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`. Next, download the `BERT-Base`
checkpoint and unzip it to some directory `$BERT_BASE_DIR`.
This example code fine-tunes `BERT-Base` on the Microsoft Research Paraphrase
Corpus (MRPC) corpus, which only contains 3,600 examples and can fine-tune in a
few minutes on most GPUs.
```shell
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
export GLUE_DIR=/path/to/glue
python run_classifier.py \
--task_name=MRPC \
--do_train=true \
--do_eval=true \
--data_dir=$GLUE_DIR/MRPC \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--max_seq_length=128 \
--train_batch_size=32 \
--learning_rate=2e-5 \
--num_train_epochs=3.0 \
--output_dir=/tmp/mrpc_output/
```
You should see output like this:
```
***** Eval results *****
eval_accuracy = 0.845588
eval_loss = 0.505248
global_step = 343
loss = 0.505248
```
This means that the Dev set accuracy was 84.55%. Small sets like MRPC have a
high variance in the Dev set accuracy, even when starting from the same
pre-training checkpoint. If you re-run multiple times (making sure to point to
different `output_dir`), you should see results between 84% and 88%.
A few other pre-trained models are implemented off-the-shelf in
`run_classifier.py`, so it should be straightforward to follow those examples to
use BERT for any single-sentence or sentence-pair classification task.
Note: You might see a message `Running train on CPU`. This really just means
that it's running on something other than a Cloud TPU, which includes a GPU.
#### Prediction from classifier
Once you have trained your classifier you can use it in inference mode by using
the --do_predict=true command. You need to have a file named test.tsv in the
input folder. Output will be created in file called test_results.tsv in the
output folder. Each line will contain output for each sample, columns are the
class probabilities.
```shell
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
export GLUE_DIR=/path/to/glue
export TRAINED_CLASSIFIER=/path/to/fine/tuned/classifier
python run_classifier.py \
--task_name=MRPC \
--do_predict=true \
--data_dir=$GLUE_DIR/MRPC \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$TRAINED_CLASSIFIER \
--max_seq_length=128 \
--output_dir=/tmp/mrpc_output/
```
### SQuAD 1.1
The Stanford Question Answering Dataset (SQuAD) is a popular question answering
benchmark dataset. BERT (at the time of the release) obtains state-of-the-art
results on SQuAD with almost no task-specific network architecture modifications
or data augmentation. However, it does require semi-complex data pre-processing
and post-processing to deal with (a) the variable-length nature of SQuAD context
paragraphs, and (b) the character-level answer annotations which are used for
SQuAD training. This processing is implemented and documented in `run_squad.py`.
To run on SQuAD, you will first need to download the dataset. The
[SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) does not seem to
link to the v1.1 datasets any longer, but the necessary files can be found here:
* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
Download these to some directory `$SQUAD_DIR`.
The state-of-the-art SQuAD results from the paper currently cannot be reproduced
on a 12GB-16GB GPU due to memory constraints (in fact, even batch size 1 does
not seem to fit on a 12GB GPU using `BERT-Large`). However, a reasonably strong
`BERT-Base` model can be trained on the GPU with these hyperparameters:
```shell
python run_squad.py \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--do_train=True \
--train_file=$SQUAD_DIR/train-v1.1.json \
--do_predict=True \
--predict_file=$SQUAD_DIR/dev-v1.1.json \
--train_batch_size=12 \
--learning_rate=3e-5 \
--num_train_epochs=2.0 \
--max_seq_length=384 \
--doc_stride=128 \
--output_dir=/tmp/squad_base/
```
The dev set predictions will be saved into a file called `predictions.json` in
the `output_dir`:
```shell
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
```
Which should produce an output like this:
```shell
{"f1": 88.41249612335034, "exact_match": 81.2488174077578}
```
You should see a result similar to the 88.5% reported in the paper for
`BERT-Base`.
If you have access to a Cloud TPU, you can train with `BERT-Large`. Here is a
set of hyperparameters (slightly different than the paper) which consistently
obtain around 90.5%-91.0% F1 single-system trained only on SQuAD:
```shell
python run_squad.py \
--vocab_file=$BERT_LARGE_DIR/vocab.txt \
--bert_config_file=$BERT_LARGE_DIR/bert_config.json \
--init_checkpoint=$BERT_LARGE_DIR/bert_model.ckpt \
--do_train=True \
--train_file=$SQUAD_DIR/train-v1.1.json \
--do_predict=True \
--predict_file=$SQUAD_DIR/dev-v1.1.json \
--train_batch_size=24 \
--learning_rate=3e-5 \
--num_train_epochs=2.0 \
--max_seq_length=384 \
--doc_stride=128 \
--output_dir=gs://some_bucket/squad_large/ \
--use_tpu=True \
--tpu_name=$TPU_NAME
```
For example, one random run with these parameters produces the following Dev
scores:
```shell
{"f1": 90.87081895814865, "exact_match": 84.38978240302744}
```
If you fine-tune for one epoch on
[TriviaQA](http://nlp.cs.washington.edu/triviaqa/) before this the results will
be even better, but you will need to convert TriviaQA into the SQuAD json
format.
### SQuAD 2.0
This model is also implemented and documented in `run_squad.py`.
To run on SQuAD 2.0, you will first need to download the dataset. The necessary
files can be found here:
* [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
* [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
* [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
Download these to some directory `$SQUAD_DIR`.
On Cloud TPU you can run with BERT-Large as follows:
```shell
python run_squad.py \
--vocab_file=$BERT_LARGE_DIR/vocab.txt \
--bert_config_file=$BERT_LARGE_DIR/bert_config.json \
--init_checkpoint=$BERT_LARGE_DIR/bert_model.ckpt \
--do_train=True \
--train_file=$SQUAD_DIR/train-v2.0.json \
--do_predict=True \
--predict_file=$SQUAD_DIR/dev-v2.0.json \
--train_batch_size=24 \
--learning_rate=3e-5 \
--num_train_epochs=2.0 \
--max_seq_length=384 \
--doc_stride=128 \
--output_dir=gs://some_bucket/squad_large/ \
--use_tpu=True \
--tpu_name=$TPU_NAME \
--version_2_with_negative=True
```
We assume you have copied everything from the output directory to a local
directory called ./squad/. The initial dev set predictions will be at
./squad/predictions.json and the differences between the score of no answer ("")
and the best non-null answer for each question will be in the file
./squad/null_odds.json
Run this script to tune a threshold for predicting null versus non-null answers:
python $SQUAD_DIR/evaluate-v2.0.py $SQUAD_DIR/dev-v2.0.json
./squad/predictions.json --na-prob-file ./squad/null_odds.json
Assume the script outputs "best_f1_thresh" THRESH. (Typical values are between
-1.0 and -5.0). You can now re-run the model to generate predictions with the
derived threshold or alternatively you can extract the appropriate answers from
./squad/nbest_predictions.json.
```shell
python run_squad.py \
--vocab_file=$BERT_LARGE_DIR/vocab.txt \
--bert_config_file=$BERT_LARGE_DIR/bert_config.json \
--init_checkpoint=$BERT_LARGE_DIR/bert_model.ckpt \
--do_train=False \
--train_file=$SQUAD_DIR/train-v2.0.json \
--do_predict=True \
--predict_file=$SQUAD_DIR/dev-v2.0.json \
--train_batch_size=24 \
--learning_rate=3e-5 \
--num_train_epochs=2.0 \
--max_seq_length=384 \
--doc_stride=128 \
--output_dir=gs://some_bucket/squad_large/ \
--use_tpu=True \
--tpu_name=$TPU_NAME \
--version_2_with_negative=True \
--null_score_diff_threshold=$THRESH
```
### Out-of-memory issues
All experiments in the paper were fine-tuned on a Cloud TPU, which has 64GB of
device RAM. Therefore, when using a GPU with 12GB - 16GB of RAM, you are likely
to encounter out-of-memory issues if you use the same hyperparameters described
in the paper.
The factors that affect memory usage are:
* **`max_seq_length`**: The released models were trained with sequence lengths
up to 512, but you can fine-tune with a shorter max sequence length to save
substantial memory. This is controlled by the `max_seq_length` flag in our
example code.
* **`train_batch_size`**: The memory usage is also directly proportional to
the batch size.
* **Model type, `BERT-Base` vs. `BERT-Large`**: The `BERT-Large` model
requires significantly more memory than `BERT-Base`.
* **Optimizer**: The default optimizer for BERT is Adam, which requires a lot
of extra memory to store the `m` and `v` vectors. Switching to a more memory
efficient optimizer can reduce memory usage, but can also affect the
results. We have not experimented with other optimizers for fine-tuning.
Using the default training scripts (`run_classifier.py` and `run_squad.py`), we
benchmarked the maximum batch size on single Titan X GPU (12GB RAM) with
TensorFlow 1.11.0:
System | Seq Length | Max Batch Size
------------ | ---------- | --------------
`BERT-Base` | 64 | 64
... | 128 | 32
... | 256 | 16
... | 320 | 14
... | 384 | 12
... | 512 | 6
`BERT-Large` | 64 | 12
... | 128 | 6
... | 256 | 2
... | 320 | 1
... | 384 | 0
... | 512 | 0
Unfortunately, these max batch sizes for `BERT-Large` are so small that they
will actually harm the model accuracy, regardless of the learning rate used. We
are working on adding code to this repository which will allow much larger
effective batch sizes to be used on the GPU. The code will be based on one (or
both) of the following techniques:
* **Gradient accumulation**: The samples in a minibatch are typically
independent with respect to gradient computation (excluding batch
normalization, which is not used here). This means that the gradients of
multiple smaller minibatches can be accumulated before performing the weight
update, and this will be exactly equivalent to a single larger update.
* [**Gradient checkpointing**](https://github.com/openai/gradient-checkpointing):
The major use of GPU/TPU memory during DNN training is caching the
intermediate activations in the forward pass that are necessary for
efficient computation in the backward pass. "Gradient checkpointing" trades
memory for compute time by re-computing the activations in an intelligent
way.
**However, this is not implemented in the current release.**
## Using BERT to extract fixed feature vectors (like ELMo)
In certain cases, rather than fine-tuning the entire pre-trained model
end-to-end, it can be beneficial to obtained *pre-trained contextual
embeddings*, which are fixed contextual representations of each input token
generated from the hidden layers of the pre-trained model. This should also
mitigate most of the out-of-memory issues.
As an example, we include the script `extract_features.py` which can be used
like this:
```shell
# Sentence A and Sentence B are separated by the ||| delimiter for sentence
# pair tasks like question answering and entailment.
# For single sentence inputs, put one sentence per line and DON'T use the
# delimiter.
echo 'Who was Jim Henson ? ||| Jim Henson was a puppeteer' > /tmp/input.txt
python extract_features.py \
--input_file=/tmp/input.txt \
--output_file=/tmp/output.jsonl \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--layers=-1,-2,-3,-4 \
--max_seq_length=128 \
--batch_size=8
```
This will create a JSON file (one line per line of input) containing the BERT
activations from each Transformer layer specified by `layers` (-1 is the final
hidden layer of the Transformer, etc.)
Note that this script will produce very large output files (by default, around
15kb for every input token).
If you need to maintain alignment between the original and tokenized words (for
projecting training labels), see the [Tokenization](#tokenization) section
below.
**Note:** You may see a message like `Could not find trained model in model_dir:
/tmp/tmpuB5g5c, running initialization to predict.` This message is expected, it
just means that we are using the `init_from_checkpoint()` API rather than the
saved model API. If you don't specify a checkpoint or specify an invalid
checkpoint, this script will complain.
## Tokenization
For sentence-level tasks (or sentence-pair) tasks, tokenization is very simple.
Just follow the example code in `run_classifier.py` and `extract_features.py`.
The basic procedure for sentence-level tasks is:
1. Instantiate an instance of `tokenizer = tokenization.FullTokenizer`
2. Tokenize the raw text with `tokens = tokenizer.tokenize(raw_text)`.
3. Truncate to the maximum sequence length. (You can use up to 512, but you
probably want to use shorter if possible for memory and speed reasons.)
4. Add the `[CLS]` and `[SEP]` tokens in the right place.
Word-level and span-level tasks (e.g., SQuAD and NER) are more complex, since
you need to maintain alignment between your input text and output text so that
you can project your training labels. SQuAD is a particularly complex example
because the input labels are *character*-based, and SQuAD paragraphs are often
longer than our maximum sequence length. See the code in `run_squad.py` to show
how we handle this.
Before we describe the general recipe for handling word-level tasks, it's
important to understand what exactly our tokenizer is doing. It has three main
steps:
1. **Text normalization**: Convert all whitespace characters to spaces, and
(for the `Uncased` model) lowercase the input and strip out accent markers.
E.g., `John Johanson's, → john johanson's,`.
2. **Punctuation splitting**: Split *all* punctuation characters on both sides
(i.e., add whitespace around all punctuation characters). Punctuation
characters are defined as (a) Anything with a `P*` Unicode class, (b) any
non-letter/number/space ASCII character (e.g., characters like `$` which are
technically not punctuation). E.g., `john johanson's, → john johanson ' s ,`
3. **WordPiece tokenization**: Apply whitespace tokenization to the output of
the above procedure, and apply
[WordPiece](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py)
tokenization to each token separately. (Our implementation is directly based
on the one from `tensor2tensor`, which is linked). E.g., `john johanson ' s
, → john johan ##son ' s ,`
The advantage of this scheme is that it is "compatible" with most existing
English tokenizers. For example, imagine that you have a part-of-speech tagging
task which looks like this:
```
Input: John Johanson 's house
Labels: NNP NNP POS NN
```
The tokenized output will look like this:
```
Tokens: john johan ##son ' s house
```
Crucially, this would be the same output as if the raw text were `John
Johanson's house` (with no space before the `'s`).
If you have a pre-tokenized representation with word-level annotations, you can
simply tokenize each input word independently, and deterministically maintain an
original-to-tokenized alignment:
```python
### Input
orig_tokens = ["John", "Johanson", "'s", "house"]
labels = ["NNP", "NNP", "POS", "NN"]
### Output
bert_tokens = []
# Token map will be an int -> int mapping between the `orig_tokens` index and
# the `bert_tokens` index.
orig_to_tok_map = []
tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_file, do_lower_case=True)
bert_tokens.append("[CLS]")
for orig_token in orig_tokens:
orig_to_tok_map.append(len(bert_tokens))
bert_tokens.extend(tokenizer.tokenize(orig_token))
bert_tokens.append("[SEP]")
# bert_tokens == ["[CLS]", "john", "johan", "##son", "'", "s", "house", "[SEP]"]
# orig_to_tok_map == [1, 2, 4, 6]
```
Now `orig_to_tok_map` can be used to project `labels` to the tokenized
representation.
There are common English tokenization schemes which will cause a slight mismatch
between how BERT was pre-trained. For example, if your input tokenization splits
off contractions like `do n't`, this will cause a mismatch. If it is possible to
do so, you should pre-process your data to convert these back to raw-looking
text, but if it's not possible, this mismatch is likely not a big deal.
## Pre-training with BERT
We are releasing code to do "masked LM" and "next sentence prediction" on an
arbitrary text corpus. Note that this is *not* the exact code that was used for
the paper (the original code was written in C++, and had some additional
complexity), but this code does generate pre-training data as described in the
paper.
Here's how to run the data generation. The input is a plain text file, with one
sentence per line. (It is important that these be actual sentences for the "next
sentence prediction" task). Documents are delimited by empty lines. The output
is a set of `tf.train.Example`s serialized into `TFRecord` file format.
You can perform sentence segmentation with an off-the-shelf NLP toolkit such as
[spaCy](https://spacy.io/). The `create_pretraining_data.py` script will
concatenate segments until they reach the maximum sequence length to minimize
computational waste from padding (see the script for more details). However, you
may want to intentionally add a slight amount of noise to your input data (e.g.,
randomly truncate 2% of input segments) to make it more robust to non-sentential
input during fine-tuning.
This script stores all of the examples for the entire input file in memory, so
for large data files you should shard the input file and call the script
multiple times. (You can pass in a file glob to `run_pretraining.py`, e.g.,
`tf_examples.tf_record*`.)
The `max_predictions_per_seq` is the maximum number of masked LM predictions per
sequence. You should set this to around `max_seq_length` * `masked_lm_prob` (the
script doesn't do that automatically because the exact value needs to be passed
to both scripts).
```shell
python create_pretraining_data.py \
--input_file=./sample_text.txt \
--output_file=/tmp/tf_examples.tfrecord \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--do_lower_case=True \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=5
```
Here's how to run the pre-training. Do not include `init_checkpoint` if you are
pre-training from scratch. The model configuration (including vocab size) is
specified in `bert_config_file`. This demo code only pre-trains for a small
number of steps (20), but in practice you will probably want to set
`num_train_steps` to 10000 steps or more. The `max_seq_length` and
`max_predictions_per_seq` parameters passed to `run_pretraining.py` must be the
same as `create_pretraining_data.py`.
```shell
python run_pretraining.py \
--input_file=/tmp/tf_examples.tfrecord \
--output_dir=/tmp/pretraining_output \
--do_train=True \
--do_eval=True \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--train_batch_size=32 \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--num_train_steps=20 \
--num_warmup_steps=10 \
--learning_rate=2e-5
```
This will produce an output like this:
```
***** Eval results *****
global_step = 20
loss = 0.0979674
masked_lm_accuracy = 0.985479
masked_lm_loss = 0.0979328
next_sentence_accuracy = 1.0
next_sentence_loss = 3.45724e-05
```
Note that since our `sample_text.txt` file is very small, this example training
will overfit that data in only a few steps and produce unrealistically high
accuracy numbers.
### Pre-training tips and caveats
* **If using your own vocabulary, make sure to change `vocab_size` in
`bert_config.json`. If you use a larger vocabulary without changing this,
you will likely get NaNs when training on GPU or TPU due to unchecked
out-of-bounds access.**
* If your task has a large domain-specific corpus available (e.g., "movie
reviews" or "scientific papers"), it will likely be beneficial to run
additional steps of pre-training on your corpus, starting from the BERT
checkpoint.
* The learning rate we used in the paper was 1e-4. However, if you are doing
additional steps of pre-training starting from an existing BERT checkpoint,
you should use a smaller learning rate (e.g., 2e-5).
* Current BERT models are English-only, but we do plan to release a
multilingual model which has been pre-trained on a lot of languages in the
near future (hopefully by the end of November 2018).
* Longer sequences are disproportionately expensive because attention is
quadratic to the sequence length. In other words, a batch of 64 sequences of
length 512 is much more expensive than a batch of 256 sequences of
length 128. The fully-connected/convolutional cost is the same, but the
attention cost is far greater for the 512-length sequences. Therefore, one
good recipe is to pre-train for, say, 90,000 steps with a sequence length of
128 and then for 10,000 additional steps with a sequence length of 512. The
very long sequences are mostly needed to learn positional embeddings, which
can be learned fairly quickly. Note that this does require generating the
data twice with different values of `max_seq_length`.
* If you are pre-training from scratch, be prepared that pre-training is
computationally expensive, especially on GPUs. If you are pre-training from
scratch, our recommended recipe is to pre-train a `BERT-Base` on a single
[preemptible Cloud TPU v2](https://cloud.google.com/tpu/docs/pricing), which
takes about 2 weeks at a cost of about $500 USD (based on the pricing in
October 2018). You will have to scale down the batch size when only training
on a single Cloud TPU, compared to what was used in the paper. It is
recommended to use the largest batch size that fits into TPU memory.
### Pre-training data
We will **not** be able to release the pre-processed datasets used in the paper.
For Wikipedia, the recommended pre-processing is to download
[the latest dump](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2),
extract the text with
[`WikiExtractor.py`](https://github.com/attardi/wikiextractor), and then apply
any necessary cleanup to convert it into plain text.
Unfortunately the researchers who collected the
[BookCorpus](http://yknzhu.wixsite.com/mbweb) no longer have it available for
public download. The
[Project Guttenberg Dataset](https://web.eecs.umich.edu/~lahiri/gutenberg_dataset.html)
is a somewhat smaller (200M word) collection of older books that are public
domain.
[Common Crawl](http://commoncrawl.org/) is another very large collection of
text, but you will likely have to do substantial pre-processing and cleanup to
extract a usable corpus for pre-training BERT.
### Learning a new WordPiece vocabulary
This repository does not include code for *learning* a new WordPiece vocabulary.
The reason is that the code used in the paper was implemented in C++ with
dependencies on Google's internal libraries. For English, it is almost always
better to just start with our vocabulary and pre-trained models. For learning
vocabularies of other languages, there are a number of open source options
available. However, keep in mind that these are not compatible with our
`tokenization.py` library:
* [Google's SentencePiece library](https://github.com/google/sentencepiece)
* [tensor2tensor's WordPiece generation script](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder_build_subword.py)
* [Rico Sennrich's Byte Pair Encoding library](https://github.com/rsennrich/subword-nmt)
## Using BERT in Colab
If you want to use BERT with [Colab](https://colab.research.google.com), you can
get started with the notebook
"[BERT FineTuning with Cloud TPUs](https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)".
**At the time of this writing (October 31st, 2018), Colab users can access a
Cloud TPU completely for free.** Note: One per user, availability limited,
requires a Google Cloud Platform account with storage (although storage may be
purchased with free credit for signing up with GCP), and this capability may not
longer be available in the future. Click on the BERT Colab that was just linked
for more information.
## FAQ
#### Is this code compatible with Cloud TPUs? What about GPUs?
Yes, all of the code in this repository works out-of-the-box with CPU, GPU, and
Cloud TPU. However, GPU training is single-GPU only.
#### I am getting out-of-memory errors, what is wrong?
See the section on [out-of-memory issues](#out-of-memory-issues) for more
information.
#### Is there a PyTorch version available?
There is no official PyTorch implementation. However, NLP researchers from
HuggingFace made a
[PyTorch version of BERT available](https://github.com/huggingface/pytorch-pretrained-BERT)
which is compatible with our pre-trained checkpoints and is able to reproduce
our results. We were not involved in the creation or maintenance of the PyTorch
implementation so please direct any questions towards the authors of that
repository.
#### Is there a Chainer version available?
There is no official Chainer implementation. However, Sosuke Kobayashi made a
[Chainer version of BERT available](https://github.com/soskek/bert-chainer)
which is compatible with our pre-trained checkpoints and is able to reproduce
our results. We were not involved in the creation or maintenance of the Chainer
implementation so please direct any questions towards the authors of that
repository.
#### Will models in other languages be released?
Yes, we plan to release a multi-lingual BERT model in the near future. We cannot
make promises about exactly which languages will be included, but it will likely
be a single model which includes *most* of the languages which have a
significantly-sized Wikipedia.
#### Will models larger than `BERT-Large` be released?
So far we have not attempted to train anything larger than `BERT-Large`. It is
possible that we will release larger models if we are able to obtain significant
improvements.
#### What license is this library released under?
All code *and* models are released under the Apache 2.0 license. See the
`LICENSE` file for more information.
#### How do I cite BERT?
For now, cite [the Arxiv paper](https://arxiv.org/abs/1810.04805):
```
@article{devlin2018bert,
title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
journal={arXiv preprint arXiv:1810.04805},
year={2018}
}
```
If we submit the paper to a conference or journal, we will update the BibTeX.
## Disclaimer
This is not an official Google product.
## Contact information
For help or issues using BERT, please submit a GitHub issue.
For personal communication related to BERT, please contact Jacob Devlin
(`jacobdevlin@google.com`), Ming-Wei Chang (`mingweichang@google.com`), or
Kenton Lee (`kentonl@google.com`).
================================================
FILE: bert/__init__.py
================================================
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
================================================
FILE: bert/create_pretraining_data.py
================================================
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import random
from bert import tokenization
import tensorflow as tf
flags = tf.flags
FLAGS = flags.FLAGS
flags.DEFINE_string("input_file", None,
"Input raw text file (or comma-separated list of files).")
flags.DEFINE_string(
"output_file", None,
"Output TF example file (or comma-separated list of files).")
flags.DEFINE_string("vocab_file", None,
"The vocabulary file that the BERT model was trained on.")
flags.DEFINE_bool(
"do_lower_case", True,
"Whether to lower case the input text. Should be True for uncased "
"models and False for cased models.")
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
flags.DEFINE_integer("max_predictions_per_seq", 20,
"Maximum number of masked LM predictions per sequence.")
flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
flags.DEFINE_integer(
"dupe_factor", 10,
"Number of times to duplicate the input data (with different masks).")
flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
flags.DEFINE_float(
"short_seq_prob", 0.1,
"Probability of creating sequences which are shorter than the "
"maximum length.")
class TrainingInstance(object):
"""A single training instance (sentence pair)."""
def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
is_random_next):
self.tokens = tokens
self.segment_ids = segment_ids
self.is_random_next = is_random_next
self.masked_lm_positions = masked_lm_positions
self.masked_lm_labels = masked_lm_labels
def __str__(self):
s = ""
s += "tokens: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.tokens]))
s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
s += "is_random_next: %s\n" % self.is_random_next
s += "masked_lm_positions: %s\n" % (" ".join(
[str(x) for x in self.masked_lm_positions]))
s += "masked_lm_labels: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.masked_lm_labels]))
s += "\n"
return s
def __repr__(self):
return self.__str__()
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
max_predictions_per_seq, output_files):
"""Create TF example files from `TrainingInstance`s."""
writers = []
for output_file in output_files:
writers.append(tf.python_io.TFRecordWriter(output_file))
writer_index = 0
total_written = 0
for (inst_index, instance) in enumerate(instances):
input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
input_mask = [1] * len(input_ids)
segment_ids = list(instance.segment_ids)
assert len(input_ids) <= max_seq_length
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
masked_lm_positions = list(instance.masked_lm_positions)
masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
masked_lm_weights = [1.0] * len(masked_lm_ids)
while len(masked_lm_positions) < max_predictions_per_seq:
masked_lm_positions.append(0)
masked_lm_ids.append(0)
masked_lm_weights.append(0.0)
next_sentence_label = 1 if instance.is_random_next else 0
features = collections.OrderedDict()
features["input_ids"] = create_int_feature(input_ids)
features["input_mask"] = create_int_feature(input_mask)
features["segment_ids"] = create_int_feature(segment_ids)
features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
features["next_sentence_labels"] = create_int_feature([next_sentence_label])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writers[writer_index].write(tf_example.SerializeToString())
writer_index = (writer_index + 1) % len(writers)
total_written += 1
if inst_index < 20:
tf.logging.info("*** Example ***")
tf.logging.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in instance.tokens]))
for feature_name in features.keys():
feature = features[feature_name]
values = []
if feature.int64_list.value:
values = feature.int64_list.value
elif feature.float_list.value:
values = feature.float_list.value
tf.logging.info(
"%s: %s" % (feature_name, " ".join([str(x) for x in values])))
for writer in writers:
writer.close()
tf.logging.info("Wrote %d total instances", total_written)
def create_int_feature(values):
feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
return feature
def create_float_feature(values):
feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
return feature
def create_training_instances(input_files, tokenizer, max_seq_length,
dupe_factor, short_seq_prob, masked_lm_prob,
max_predictions_per_seq, rng):
"""Create `TrainingInstance`s from raw text."""
all_documents = [[]]
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.
for input_file in input_files:
with tf.gfile.GFile(input_file, "r") as reader:
while True:
line = tokenization.convert_to_unicode(reader.readline())
if not line:
break
line = line.strip()
# Empty lines are used as document delimiters
if not line:
all_documents.append([])
tokens = tokenizer.tokenize(line)
if tokens:
all_documents[-1].append(tokens)
# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)
vocab_words = list(tokenizer.vocab.keys())
instances = []
for _ in range(dupe_factor):
for document_index in range(len(all_documents)):
instances.extend(
create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
rng.shuffle(instances)
return instances
def create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
"""Creates `TrainingInstance`s for a single document."""
document = all_documents[document_index]
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3
# We *usually* want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally wasted
# computation. However, we *sometimes*
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
# sequences to minimize the mismatch between pre-training and fine-tuning.
# The `target_seq_length` is just a rough target however, whereas
# `max_seq_length` is a hard limit.
target_seq_length = max_num_tokens
if rng.random() < short_seq_prob:
target_seq_length = rng.randint(2, max_num_tokens)
# We DON'T just concatenate all of the tokens from a document into a long
# sequence and choose an arbitrary split point because this would make the
# next sentence prediction task too easy. Instead, we split the input into
# segments "A" and "B" based on the actual "sentences" provided by the user
# input.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = rng.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random next
is_random_next = False
if len(current_chunk) == 1 or rng.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# This should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document
# we're processing.
for _ in range(10):
random_document_index = rng.randint(0, len(all_documents) - 1)
if random_document_index != document_index:
break
random_document = all_documents[random_document_index]
random_start = rng.randint(0, len(random_document) - 1)
for j in range(random_start, len(random_document)):
tokens_b.extend(random_document[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
(tokens, masked_lm_positions,
masked_lm_labels) = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
instance = TrainingInstance(
tokens=tokens,
segment_ids=segment_ids,
is_random_next=is_random_next,
masked_lm_positions=masked_lm_positions,
masked_lm_labels=masked_lm_labels)
instances.append(instance)
current_chunk = []
current_length = 0
i += 1
return instances
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])
def create_masked_lm_predictions(tokens, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng):
"""Creates the predictions for the masked LM objective."""
cand_indexes = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indexes.append(i)
rng.shuffle(cand_indexes)
output_tokens = list(tokens)
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
masked_lms = []
covered_indexes = set()
for index in cand_indexes:
if len(masked_lms) >= num_to_predict:
break
if index in covered_indexes:
continue
covered_indexes.add(index)
masked_token = None
# 80% of the time, replace with [MASK]
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if rng.random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
masked_lms = sorted(masked_lms, key=lambda x: x.index)
masked_lm_positions = []
masked_lm_labels = []
for p in masked_lms:
masked_lm_positions.append(p.index)
masked_lm_labels.append(p.label)
return (output_tokens, masked_lm_positions, masked_lm_labels)
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if rng.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
tokenizer = tokenization.FullTokenizer(
vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
input_files = []
for input_pattern in FLAGS.input_file.split(","):
input_files.extend(tf.gfile.Glob(input_pattern))
tf.logging.info("*** Reading from input files ***")
for input_file in input_files:
tf.logging.info(" %s", input_file)
rng = random.Random(FLAGS.random_seed)
instances = create_training_instances(
input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
rng)
output_files = FLAGS.output_file.split(",")
tf.logging.info("*** Writing to output files ***")
for output_file in output_files:
tf.logging.info(" %s", output_file)
write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
FLAGS.max_predictions_per_seq, output_files)
if __name__ == "__main__":
flags.mark_flag_as_required("input_file")
flags.mark_flag_as_required("output_file")
flags.mark_flag_as_required("vocab_file")
tf.app.run()
================================================
FILE: bert/extract_features.py
================================================
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import collections
import json
import re
from bert import modeling
from bert import tokenization
import tensorflow as tf
flags = tf.flags
FLAGS = flags.FLAGS
flags.DEFINE_string("input_file", None, "")
flags.DEFINE_string("output_file", None, "")
flags.DEFINE_string("layers", "-1,-2,-3,-4", "")
flags.DEFINE_string(
"bert_config_file", None,
"The config json file corresponding to the pre-trained BERT model. "
"This specifies the model architecture.")
flags.DEFINE_integer(
"max_seq_length", 128,
"The maximum total input sequence length after WordPiece tokenization. "
"Sequences longer than this will be truncated, and sequences shorter "
"than this will be padded.")
flags.DEFINE_string(
"init_checkpoint", None,
"Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_string("vocab_file", None,
"The vocabulary file that the BERT model was trained on.")
flags.DEFINE_bool(
"do_lower_case", True,
"Whether to lower case the input text. Should be True for uncased "
"models and False for cased models.")
flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
flags.DEFINE_string("master", None,
"If using a TPU, the address of the master.")
flags.DEFINE_integer(
"num_tpu_cores", 8,
"Only used if `use_tpu` is True. Total number of TPU cores to use.")
flags.DEFINE_bool(
"use_one_hot_embeddings", False,
"If True, tf.one_hot will be used for embedding lookups, otherwise "
"tf.nn.embedding_lookup will be used. On TPUs, this should be True "
"since it is much faster.")
class InputExample(object):
def __init__(self, unique_id, text_a, text_b):
self.unique_id = unique_id
self.text_a = text_a
self.text_b = text_b
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
self.unique_id = unique_id
self.tokens = tokens
self.input_ids = input_ids
self.input_mask = input_mask
self.input_type_ids = input_type_ids
def input_fn_builder(features, seq_length):
"""Creates an `input_fn` closure to be passed to TPUEstimator."""
all_unique_ids = []
all_input_ids = []
all_input_mask = []
all_input_type_ids = []
for feature in features:
all_unique_ids.append(feature.unique_id)
all_input_ids.append(feature.input_ids)
all_input_mask.append(feature.input_mask)
all_input_type_ids.append(feature.input_type_ids)
def input_fn(params):
"""The actual input function."""
batch_size = params["batch_size"]
num_examples = len(features)
# This is for demo purposes and does NOT scale to large data sets. We do
# not use Dataset.from_generator() because that uses tf.py_func which is
# not TPU compatible. The right way to load data is with TFRecordReader.
d = tf.data.Dataset.from_tensor_slices({
"unique_ids":
tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
"input_ids":
tf.constant(
all_input_ids, shape=[num_examples, seq_length],
dtype=tf.int32),
"input_mask":
tf.constant(
all_input_mask,
shape=[num_examples, seq_length],
dtype=tf.int32),
"input_type_ids":
tf.constant(
all_input_type_ids,
shape=[num_examples, seq_length],
dtype=tf.int32),
})
d = d.batch(batch_size=batch_size, drop_remainder=False)
return d
return input_fn
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
use_one_hot_embeddings):
"""Returns `model_fn` closure for TPUEstimator."""
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
"""The `model_fn` for TPUEstimator."""
unique_ids = features["unique_ids"]
input_ids = features["input_ids"]
input_mask = features["input_mask"]
input_type_ids = features["input_type_ids"]
model = modeling.BertModel(
config=bert_config,
is_training=False,
input_ids=input_ids,
input_mask=input_mask,
token_type_ids=input_type_ids,
use_one_hot_embeddings=use_one_hot_embeddings)
if mode != tf.estimator.ModeKeys.PREDICT:
raise ValueError("Only PREDICT modes are supported: %s" % (mode))
tvars = tf.trainable_variables()
scaffold_fn = None
(assignment_map,
initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
tvars, init_checkpoint)
if use_tpu:
def tpu_scaffold():
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
return tf.train.Scaffold()
scaffold_fn = tpu_scaffold
else:
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
tf.logging.info("**** Trainable Variables ****")
for var in tvars:
init_string = ""
if var.name in initialized_variable_names:
init_string = ", *INIT_FROM_CKPT*"
tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
init_string)
all_layers = model.get_all_encoder_layers()
predictions = {
"unique_id": unique_ids,
}
for (i, layer_index) in enumerate(layer_indexes):
predictions["layer_output_%d" % i] = all_layers[layer_index]
output_spec = tf.contrib.tpu.TPUEstimatorSpec(
mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
return output_spec
return model_fn
def convert_examples_to_features(examples, seq_length, tokenizer):
"""Loads a data file into a list of `InputBatch`s."""
features = []
for (ex_index, example) in enumerate(examples):
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > seq_length - 2:
tokens_a = tokens_a[0:(seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
input_type_ids = []
tokens.append("[CLS]")
input_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
input_type_ids.append(0)
tokens.append("[SEP]")
input_type_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
input_type_ids.append(1)
tokens.append("[SEP]")
input_type_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < seq_length:
input_ids.append(0)
input_mask.append(0)
input_type_ids.append(0)
assert len(input_ids) == seq_length
assert len(input_mask) == seq_length
assert len(input_type_ids) == seq_length
if ex_index < 5:
tf.logging.info("*** Example ***")
tf.logging.info("unique_id: %s" % (example.unique_id))
tf.logging.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
tf.logging.info(
"input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
features.append(
InputFeatures(
unique_id=example.unique_id,
tokens=tokens,
input_ids=input_ids,
input_mask=input_mask,
input_type_ids=input_type_ids))
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def read_examples(input_file):
"""Read a list of `InputExample`s from an input file."""
examples = []
unique_id = 0
with tf.gfile.GFile(input_file, "r") as reader:
while True:
line = tokenization.convert_to_unicode(reader.readline())
if not line:
break
line = line.strip()
text_a = None
text_b = None
m = re.match(r"^(.*) \|\|\| (.*)$", line)
if m is None:
text_a = line
else:
text_a = m.group(1)
text_b = m.group(2)
examples.append(
InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
unique_id += 1
return examples
def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
layer_indexes = [int(x) for x in FLAGS.layers.split(",")]
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
tokenizer = tokenization.FullTokenizer(
vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
master=FLAGS.master,
tpu_config=tf.contrib.tpu.TPUConfig(
num_shards=FLAGS.num_tpu_cores,
per_host_input_for_training=is_per_host))
examples = read_examples(FLAGS.input_file)
features = convert_examples_to_features(
examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
unique_id_to_feature[feature.unique_id] = feature
model_fn = model_fn_builder(
bert_config=bert_config,
init_checkpoint=FLAGS.init_checkpoint,
layer_indexes=layer_indexes,
use_tpu=FLAGS.use_tpu,
use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)
# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
estimator = tf.contrib.tpu.TPUEstimator(
use_tpu=FLAGS.use_tpu,
model_fn=model_fn,
config=run_config,
predict_batch_size=FLAGS.batch_size)
input_fn = input_fn_builder(
features=features, seq_length=FLAGS.max_seq_length)
with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
"w")) as writer:
for result in estimator.predict(input_fn, yield_single_examples=True):
unique_id = int(result["unique_id"])
feature = unique_id_to_feature[unique_id]
output_json = collections.OrderedDict()
output_json["linex_index"] = unique_id
all_features = []
for (i, token) in enumerate(feature.tokens):
all_layers = []
for (j, layer_index) in enumerate(layer_indexes):
layer_output = result["layer_output_%d" % j]
layers = collections.OrderedDict()
layers["index"] = layer_index
layers["values"] = [
round(float(x), 6) for x in layer_output[i:(i + 1)].flat
]
all_layers.append(layers)
features = collections.OrderedDict()
features["token"] = token
features["layers"] = all_layers
all_features.append(features)
output_json["features"] = all_features
writer.write(json.dumps(output_json) + "\n")
if __name__ == "__main__":
flags.mark_flag_as_required("input_file")
flags.mark_flag_as_required("vocab_file")
flags.mark_flag_as_required("bert_config_file")
flags.mark_flag_as_required("init_checkpoint")
flags.mark_flag_as_required("output_file")
tf.app.run()
================================================
FILE: bert/modeling.py
================================================
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The main BERT model and related functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import copy
import json
import math
import re
import numpy as np
import six
import tensorflow as tf
class BertConfig(object):
"""Configuration for `BertModel`."""
def __init__(self,
vocab_size,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02):
"""Constructs BertConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
@classmethod
def from_dict(cls, json_object):
"""Constructs a `BertConfig` from a Python dictionary of parameters."""
config = BertConfig(vocab_size=None)
for (key, value) in six.iteritems(json_object):
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `BertConfig` from a json file of parameters."""
with tf.gfile.GFile(json_file, "r") as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
class BertModel(object):
"""BERT model ("Bidirectional Encoder Representations from Transformers").
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
model = modeling.BertModel(config=config, is_training=True,
input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
label_embeddings = tf.get_variable(...)
pooled_output = model.get_pooled_output()
logits = tf.matmul(pooled_output, label_embeddings)
...
```
"""
def __init__(self,
config,
is_training,
input_ids,
input_mask=None,
token_type_ids=None,
use_one_hot_embeddings=False,
scope=None):
"""Constructor for BertModel.
Args:
config: `BertConfig` instance.
is_training: bool. true for training model, false for eval model. Controls
whether dropout will be applied.
input_ids: int32 Tensor of shape [batch_size, seq_length].
input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
embeddings or tf.embedding_lookup() for the word embeddings.
scope: (optional) variable scope. Defaults to "bert".
Raises:
ValueError: The config is invalid or one of the input tensor shapes
is invalid.
"""
config = copy.deepcopy(config)
if not is_training:
config.hidden_dropout_prob = 0.0
config.attention_probs_dropout_prob = 0.0
input_shape = get_shape_list(input_ids, expected_rank=2)
batch_size = input_shape[0]
seq_length = input_shape[1]
if input_mask is None:
input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
if token_type_ids is None:
token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)
with tf.variable_scope(scope, default_name="bert"):
with tf.variable_scope("embeddings"):
# Perform embedding lookup on the word ids.
(self.embedding_output, self.embedding_table) = embedding_lookup(
input_ids=input_ids,
vocab_size=config.vocab_size,
embedding_size=config.hidden_size,
initializer_range=config.initializer_range,
word_embedding_name="word_embeddings",
use_one_hot_embeddings=use_one_hot_embeddings)
# Add positional embeddings and token type embeddings, then layer
# normalize and perform dropout.
self.embedding_output = embedding_postprocessor(
input_tensor=self.embedding_output,
use_token_type=True,
token_type_ids=token_type_ids,
token_type_vocab_size=config.type_vocab_size,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=config.initializer_range,
max_position_embeddings=config.max_position_embeddings,
dropout_prob=config.hidden_dropout_prob)
with tf.variable_scope("encoder"):
# This converts a 2D mask of shape [batch_size, seq_length] to a 3D
# mask of shape [batch_size, seq_length, seq_length] which is used
# for the attention scores.
attention_mask = create_attention_mask_from_input_mask(
input_ids, input_mask)
# Run the stacked transformer.
# `sequence_output` shape = [batch_size, seq_length, hidden_size].
self.all_encoder_layers = transformer_model(
input_tensor=self.embedding_output,
attention_mask=attention_mask,
hidden_size=config.hidden_size,
num_hidden_layers=config.num_hidden_layers,
num_attention_heads=config.num_attention_heads,
intermediate_size=config.intermediate_size,
intermediate_act_fn=get_activation(config.hidden_act),
hidden_dropout_prob=config.hidden_dropout_prob,
attention_probs_dropout_prob=config.attention_probs_dropout_prob,
initializer_range=config.initializer_range,
do_return_all_layers=True)
self.sequence_output = self.all_encoder_layers[-1]
# The "pooler" converts the encoded sequence tensor of shape
# [batch_size, seq_length, hidden_size] to a tensor of shape
# [batch_size, hidden_size]. This is necessary for segment-level
# (or segment-pair-level) classification tasks where we need a fixed
# dimensional representation of the segment.
with tf.variable_scope("pooler"):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token. We assume that this has been pre-trained
first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
self.pooled_output = tf.layers.dense(
first_token_tensor,
config.hidden_size,
activation=tf.tanh,
kernel_initializer=create_initializer(config.initializer_range))
def get_pooled_output(self):
return self.pooled_output
def get_sequence_output(self):
"""Gets final hidden layer of encoder.
Returns:
float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
to the final hidden of the transformer encoder.
"""
return self.sequence_output
def get_all_encoder_layers(self):
return self.all_encoder_layers
def get_embedding_output(self):
"""Gets output of the embedding lookup (i.e., input to the transformer).
Returns:
float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
to the output of the embedding layer, after summing the word
embeddings with the positional embeddings and the token type embeddings,
then performing layer normalization. This is the input to the transformer.
"""
return self.embedding_output
def get_embedding_table(self):
return self.embedding_table
def gelu(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the RELU.
Original paper: https://arxiv.org/abs/1606.08415
Args:
x: float Tensor to perform activation.
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
def get_activation(activation_string):
"""Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
Args:
activation_string: String name of the activation function.
Returns:
A Python function corresponding to the activation function. If
`activation_string` is None, empty, or "linear", this will return None.
If `activation_string` is not a string, it will return `activation_string`.
Raises:
ValueError: The `activation_string` does not correspond to a known
activation.
"""
# We assume that anything that"s not a string is already an activation
# function, so we just return it.
if not isinstance(activation_string, six.string_types):
return activation_string
if not activation_string:
return None
act = activation_string.lower()
if act == "linear":
return None
elif act == "relu":
return tf.nn.relu
elif act == "gelu":
return gelu
elif act == "tanh":
return tf.tanh
else:
raise ValueError("Unsupported activation: %s" % act)
def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
"""Compute the union of the current variables and checkpoint variables."""
assignment_map = {}
initialized_variable_names = {}
name_to_variable = collections.OrderedDict()
for var in tvars:
name = var.name
m = re.match("^(.*):\\d+$", name)
if m is not None:
name = m.group(1)
name_to_variable[name] = var
init_vars = tf.train.list_variables(init_checkpoint)
assignment_map = collections.OrderedDict()
for x in init_vars:
(name, var) = (x[0], x[1])
if name not in name_to_variable:
continue
assignment_map[name] = name
initialized_variable_names[name] = 1
initialized_variable_names[name + ":0"] = 1
return (assignment_map, initialized_variable_names)
def dropout(input_tensor, dropout_prob):
"""Perform dropout.
Args:
input_tensor: float Tensor.
dropout_prob: Python float. The probability of dropping out a value (NOT of
*keeping* a dimension as in `tf.nn.dropout`).
Returns:
A version of `input_tensor` with dropout applied.
"""
if dropout_prob is None or dropout_prob == 0.0:
return input_tensor
output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
return output
def layer_norm(input_tensor, name=None):
"""Run layer normalization on the last dimension of the tensor."""
return tf.contrib.layers.layer_norm(
inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
"""Runs layer normalization followed by dropout."""
output_tensor = layer_norm(input_tensor, name)
output_tensor = dropout(output_tensor, dropout_prob)
return output_tensor
def create_initializer(initializer_range=0.02):
"""Creates a `truncated_normal_initializer` with the given range."""
return tf.truncated_normal_initializer(stddev=initializer_range)
def embedding_lookup(input_ids,
vocab_size,
embedding_size=128,
initializer_range=0.02,
word_embedding_name="word_embeddings",
use_one_hot_embeddings=False):
"""Looks up words embeddings for id tensor.
Args:
input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
ids.
vocab_size: int. Size of the embedding vocabulary.
embedding_size: int. Width of the word embeddings.
initializer_range: float. Embedding initialization range.
word_embedding_name: string. Name of the embedding table.
use_one_hot_embeddings: bool. If True, use one-hot method for word
embeddings. If False, use `tf.gather()`.
Returns:
float Tensor of shape [batch_size, seq_length, embedding_size].
"""
# This function assumes that the input is of shape [batch_size, seq_length,
# num_inputs].
#
# If the input is a 2D tensor of shape [batch_size, seq_length], we
# reshape to [batch_size, seq_length, 1].
if input_ids.shape.ndims == 2:
input_ids = tf.expand_dims(input_ids, axis=[-1])
embedding_table = tf.get_variable(
name=word_embedding_name,
shape=[vocab_size, embedding_size],
initializer=create_initializer(initializer_range))
flat_input_ids = tf.reshape(input_ids, [-1])
if use_one_hot_embeddings:
one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
output = tf.matmul(one_hot_input_ids, embedding_table)
else:
output = tf.gather(embedding_table, flat_input_ids)
input_shape = get_shape_list(input_ids)
output = tf.reshape(output,
input_shape[0:-1] + [input_shape[-1] * embedding_size])
return (output, embedding_table)
def embedding_postprocessor(input_tensor,
use_token_type=False,
token_type_ids=None,
token_type_vocab_size=16,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=0.02,
max_position_embeddings=512,
dropout_prob=0.1):
"""Performs various post-processing on a word embedding tensor.
Args:
input_tensor: float Tensor of shape [batch_size, seq_length,
embedding_size].
use_token_type: bool. Whether to add embeddings for `token_type_ids`.
token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
Must be specified if `use_token_type` is True.
token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
token_type_embedding_name: string. The name of the embedding table variable
for token type ids.
use_position_embeddings: bool. Whether to add position embeddings for the
position of each token in the sequence.
position_embedding_name: string. The name of the embedding table variable
for positional embeddings.
initializer_range: float. Range of the weight initialization.
max_position_embeddings: int. Maximum sequence length that might ever be
used with this model. This can be longer than the sequence length of
input_tensor, but cannot be shorter.
dropout_prob: float. Dropout probability applied to the final output tensor.
Returns:
float tensor with same shape as `input_tensor`.
Raises:
ValueError: One of the tensor shapes or input values is invalid.
"""
input_shape = get_shape_list(input_tensor, expected_rank=3)
batch_size = input_shape[0]
seq_length = input_shape[1]
width = input_shape[2]
output = input_tensor
if use_token_type:
if token_type_ids is None:
raise ValueError("`token_type_ids` must be specified if"
"`use_token_type` is True.")
token_type_table = tf.get_variable(
name=token_type_embedding_name,
shape=[token_type_vocab_size, width],
initializer=create_initializer(initializer_range))
# This vocab will be small so we always do one-hot here, since it is always
# faster for a small vocabulary.
flat_token_type_ids = tf.reshape(token_type_ids, [-1])
one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
token_type_embeddings = tf.reshape(token_type_embeddings,
[batch_size, seq_length, width])
output += token_type_embeddings
if use_position_embeddings:
assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
with tf.control_dependencies([assert_op]):
full_position_embeddings = tf.get_variable(
name=position_embedding_name,
shape=[max_position_embeddings, width],
initializer=create_initializer(initializer_range))
# Since the position embedding table is a learned variable, we create it
# using a (long) sequence length `max_position_embeddings`. The actual
# sequence length might be shorter than this, for faster training of
# tasks that do not have long sequences.
#
# So `full_position_embeddings` is effectively an embedding table
# for position [0, 1, 2, ..., max_position_embeddings-1], and the current
# sequence has positions [0, 1, 2, ... seq_length-1], so we can just
# perform a slice.
position_embeddings = tf.slice(full_position_embeddings, [0, 0],
[seq_length, -1])
num_dims = len(output.shape.as_list())
# Only the last two dimensions are relevant (`seq_length` and `width`), so
# we broadcast among the first dimensions, which is typically just
# the batch size.
position_broadcast_shape = []
for _ in range(num_dims - 2):
position_broadcast_shape.append(1)
position_broadcast_shape.extend([seq_length, width])
position_embeddings = tf.reshape(position_embeddings,
position_broadcast_shape)
output += position_embeddings
output = layer_norm_and_dropout(output, dropout_prob)
return output
def create_attention_mask_from_input_mask(from_tensor, to_mask):
"""Create 3D attention mask from a 2D tensor mask.
Args:
from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
to_mask: int32 Tensor of shape [batch_size, to_seq_length].
Returns:
float Tensor of shape [batch_size, from_seq_length, to_seq_length].
"""
from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
batch_size = from_shape[0]
from_seq_length = from_shape[1]
to_shape = get_shape_list(to_mask, expected_rank=2)
to_seq_length = to_shape[1]
to_mask = tf.cast(
tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
# We don't assume that `from_tensor` is a mask (although it could be). We
# don't actually care if we attend *from* padding tokens (only *to* padding)
# tokens so we create a tensor of all ones.
#
# `broadcast_ones` = [batch_size, from_seq_length, 1]
broadcast_ones = tf.ones(
shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
# Here we broadcast along two dimensions to create the mask.
mask = broadcast_ones * to_mask
return mask
def attention_layer(from_tensor,
to_tensor,
attention_mask=None,
num_attention_heads=1,
size_per_head=512,
query_act=None,
key_act=None,
value_act=None,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
do_return_2d_tensor=False,
batch_size=None,
from_seq_length=None,
to_seq_length=None):
"""Performs multi-headed attention from `from_tensor` to `to_tensor`.
This is an implementation of multi-headed attention based on "Attention
is all you Need". If `from_tensor` and `to_tensor` are the same, then
this is self-attention. Each timestep in `from_tensor` attends to the
corresponding sequence in `to_tensor`, and returns a fixed-with vector.
This function first projects `from_tensor` into a "query" tensor and
`to_tensor` into "key" and "value" tensors. These are (effectively) a list
of tensors of length `num_attention_heads`, where each tensor is of shape
[batch_size, seq_length, size_per_head].
Then, the query and key tensors are dot-producted and scaled. These are
softmaxed to obtain attention probabilities. The value tensors are then
interpolated by these probabilities, then concatenated back to a single
tensor and returned.
In practice, the multi-headed attention are done with transposes and
reshapes rather than actual separate tensors.
Args:
from_tensor: float Tensor of shape [batch_size, from_seq_length,
from_width].
to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
attention_mask: (optional) int32 Tensor of shape [batch_size,
from_seq_length, to_seq_length]. The values should be 1 or 0. The
attention scores will effectively be set to -infinity for any positions in
the mask that are 0, and will be unchanged for positions that are 1.
num_attention_heads: int. Number of attention heads.
size_per_head: int. Size of each attention head.
query_act: (optional) Activation function for the query transform.
key_act: (optional) Activation function for the key transform.
value_act: (optional) Activation function for the value transform.
attention_probs_dropout_prob: (optional) float. Dropout probability of the
attention probabilities.
initializer_range: float. Range of the weight initializer.
do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
* from_seq_length, num_attention_heads * size_per_head]. If False, the
output will be of shape [batch_size, from_seq_length, num_attention_heads
* size_per_head].
batch_size: (Optional) int. If the input is 2D, this might be the batch size
of the 3D version of the `from_tensor` and `to_tensor`.
from_seq_length: (Optional) If the input is 2D, this might be the seq length
of the 3D version of the `from_tensor`.
to_seq_length: (Optional) If the input is 2D, this might be the seq length
of the 3D version of the `to_tensor`.
Returns:
float Tensor of shape [batch_size, from_seq_length,
num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
true, this will be of shape [batch_size * from_seq_length,
num_attention_heads * size_per_head]).
Raises:
ValueError: Any of the arguments or tensor shapes are invalid.
"""
def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
seq_length, width):
output_tensor = tf.reshape(
input_tensor, [batch_size, seq_length, num_attention_heads, width])
output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
return output_tensor
from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
if len(from_shape) != len(to_shape):
raise ValueError(
"The rank of `from_tensor` must match the rank of `to_tensor`.")
if len(from_shape) == 3:
batch_size = from_shape[0]
from_seq_length = from_shape[1]
to_seq_length = to_shape[1]
elif len(from_shape) == 2:
if (batch_size is None or from_seq_length is None or to_seq_length is None):
raise ValueError(
"When passing in rank 2 tensors to attention_layer, the values "
"for `batch_size`, `from_seq_length`, and `to_seq_length` "
"must all be specified.")
# Scalar dimensions referenced here:
# B = batch size (number of sequences)
# F = `from_tensor` sequence length
# T = `to_tensor` sequence length
# N = `num_attention_heads`
# H = `size_per_head`
from_tensor_2d = reshape_to_matrix(from_tensor)
to_tensor_2d = reshape_to_matrix(to_tensor)
# `query_layer` = [B*F, N*H]
query_layer = tf.layers.dense(
from_tensor_2d,
num_attention_heads * size_per_head,
activation=query_act,
name="query",
kernel_initializer=create_initializer(initializer_range))
# `key_layer` = [B*T, N*H]
key_layer = tf.layers.dense(
to_tensor_2d,
num_attention_heads * size_per_head,
activation=key_act,
name="key",
kernel_initializer=create_initializer(initializer_range))
# `value_layer` = [B*T, N*H]
value_layer = tf.layers.dense(
to_tensor_2d,
num_attention_heads * size_per_head,
activation=value_act,
name="value",
kernel_initializer=create_initializer(initializer_range))
# `query_layer` = [B, N, F, H]
query_layer = transpose_for_scores(query_layer, batch_size,
num_attention_heads, from_seq_length,
size_per_head)
# `key_layer` = [B, N, T, H]
key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
to_seq_length, size_per_head)
# Take the dot product between "query" and "key" to get the raw
# attention scores.
# `attention_scores` = [B, N, F, T]
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
attention_scores = tf.multiply(attention_scores,
1.0 / math.sqrt(float(size_per_head)))
if attention_mask is not None:
# `attention_mask` = [B, 1, F, T]
attention_mask = tf.expand_dims(attention_mask, axis=[1])
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
attention_scores += adder
# Normalize the attention scores to probabilities.
# `attention_probs` = [B, N, F, T]
attention_probs = tf.nn.softmax(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
# `value_layer` = [B, T, N, H]
value_layer = tf.reshape(
value_layer,
[batch_size, to_seq_length, num_attention_heads, size_per_head])
# `value_layer` = [B, N, T, H]
value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
# `context_layer` = [B, N, F, H]
context_layer = tf.matmul(attention_probs, value_layer)
# `context_layer` = [B, F, N, H]
context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
if do_return_2d_tensor:
# `context_layer` = [B*F, N*H]
context_layer = tf.reshape(
context_layer,
[batch_size * from_seq_length, num_attention_heads * size_per_head])
else:
# `context_layer` = [B, F, N*H]
context_layer = tf.reshape(
context_layer,
[batch_size, from_seq_length, num_attention_heads * size_per_head])
return context_layer
def transformer_model(input_tensor,
attention_mask=None,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
intermediate_act_fn=gelu,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
do_return_all_layers=False):
"""Multi-headed, multi-layer Transformer from "Attention is All You Need".
This is almost an exact implementation of the original Transformer encoder.
See the original paper:
https://arxiv.org/abs/1706.03762
Also see:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
Args:
input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
seq_length], with 1 for positions that can be attended to and 0 in
positions that should not be.
hidden_size: int. Hidden size of the Transformer.
num_hidden_layers: int. Number of layers (blocks) in the Transformer.
num_attention_heads: int. Number of attention heads in the Transformer.
intermediate_size: int. The size of the "intermediate" (a.k.a., feed
forward) layer.
intermediate_act_fn: function. The non-linear activation function to apply
to the output of the intermediate/feed-forward layer.
hidden_dropout_prob: float. Dropout probability for the hidden layers.
attention_probs_dropout_prob: float. Dropout probability of the attention
probabilities.
initializer_range: float. Range of the initializer (stddev of truncated
normal).
do_return_all_layers: Whether to also return all layers or just the final
layer.
Returns:
float Tensor of shape [batch_size, seq_length, hidden_size], the final
hidden layer of the Transformer.
Raises:
ValueError: A Tensor shape or parameter is invalid.
"""
if hidden_size % num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, num_attention_heads))
attention_head_size = int(hidden_size / num_attention_heads)
input_shape = get_shape_list(input_tensor, expected_rank=3)
batch_size = input_shape[0]
seq_length = input_shape[1]
input_width = input_shape[2]
# The Transformer performs sum residuals on all layers so the input needs
# to be the same as the hidden size.
if input_width != hidden_size:
raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
(input_width, hidden_size))
# We keep the representation as a 2D tensor to avoid re-shaping it back and
# forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
# the GPU/CPU but may not be free on the TPU, so we want to minimize them to
# help the optimizer.
prev_output = reshape_to_matrix(input_tensor)
all_layer_outputs = []
for layer_idx in range(num_hidden_layers):
with tf.variable_scope("layer_%d" % layer_idx):
layer_input = prev_output
with tf.variable_scope("attention"):
attention_heads = []
with tf.variable_scope("self"):
attention_head = attention_layer(
from_tensor=layer_input,
to_tensor=layer_input,
attention_mask=attention_mask,
num_attention_heads=num_attention_heads,
size_per_head=attention_head_size,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range,
do_return_2d_tensor=True,
batch_size=batch_size,
from_seq_length=seq_length,
to_seq_length=seq_length)
attention_heads.append(attention_head)
attention_output = None
if len(attention_heads) == 1:
attention_output = attention_heads[0]
else:
# In the case where we have other sequences, we just concatenate
# them to the self-attention head before the projection.
attention_output = tf.concat(attention_heads, axis=-1)
# Run a linear projection of `hidden_size` then add a residual
# with `layer_input`.
with tf.variable_scope("output"):
attention_output = tf.layers.dense(
attention_output,
hidden_size,
kernel_initializer=create_initializer(initializer_range))
attention_output = dropout(attention_output, hidden_dropout_prob)
attention_output = layer_norm(attention_output + layer_input)
# The activation is only applied to the "intermediate" hidden layer.
with tf.variable_scope("intermediate"):
intermediate_output = tf.layers.dense(
attention_output,
intermediate_size,
activation=intermediate_act_fn,
kernel_initializer=create_initializer(initializer_range))
# Down-project back to `hidden_size` then add the residual.
with tf.variable_scope("output"):
layer_output = tf.layers.dense(
intermediate_output,
hidden_size,
kernel_initializer=create_initializer(initializer_range))
layer_output = dropout(layer_output, hidden_dropout_prob)
layer_output = layer_norm(layer_output + attention_output)
prev_output = layer_output
all_layer_outputs.append(layer_output)
if do_return_all_layers:
final_outputs = []
for layer_output in all_layer_outputs:
final_output = reshape_from_matrix(layer_output, input_shape)
final_outputs.append(final_output)
return final_outputs
else:
final_output = reshape_from_matrix(prev_output, input_shape)
return final_output
def get_shape_list(tensor, expected_rank=None, name=None):
"""Returns a list of the shape of tensor, preferring static dimensions.
Args:
tensor: A tf.Tensor object to find the shape of.
expected_rank: (optional) int. The expected rank of `tensor`. If this is
specified and the `tensor` has a different rank, and exception will be
thrown.
name: Optional name of the tensor for the error message.
Returns:
A list of dimensions of the shape of tensor. All static dimensions will
be returned as python integers, and dynamic dimensions will be returned
as tf.Tensor scalars.
"""
if name is None:
name = tensor.name
if expected_rank is not None:
assert_rank(tensor, expected_rank, name)
shape = tensor.shape.as_list()
non_static_indexes = []
for (index, dim) in enumerate(shape):
if dim is None:
non_static_indexes.append(index)
if not non_static_indexes:
return shape
dyn_shape = tf.shape(tensor)
for index in non_static_indexes:
shape[index] = dyn_shape[index]
return shape
def reshape_to_matrix(input_tensor):
"""Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
ndims = input_tensor.shape.ndims
if ndims < 2:
raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
(input_tensor.shape))
if ndims == 2:
return input_tensor
width = input_tensor.shape[-1]
output_tensor = tf.reshape(input_tensor, [-1, width])
return output_tensor
def reshape_from_matrix(output_tensor, orig_shape_list):
"""Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
if len(orig_shape_list) == 2:
return output_tensor
output_shape = get_shape_list(output_tensor)
orig_dims = orig_shape_list[0:-1]
width = output_shape[-1]
return tf.reshape(output_tensor, orig_dims + [width])
def assert_rank(tensor, expected_rank, name=None):
"""Raises an exception if the tensor rank is not of the expected rank.
Args:
tensor: A tf.Tensor to check the rank of.
expected_rank: Python integer or list of integers, expected rank.
name: Optional name of the tensor for the error message.
Raises:
ValueError: If the expected shape doesn't match the actual shape.
"""
if name is None:
name = tensor.name
expected_rank_dict = {}
if isinstance(expected_rank, six.integer_types):
expected_rank_dict[expected_rank] = True
else:
for x in expected_rank:
expected_rank_dict[x] = True
actual_rank = tensor.shape.ndims
if actual_rank not in expected_rank_dict:
scope_name = tf.get_variable_scope().name
raise ValueError(
"For the tensor `%s` in scope `%s`, the actual rank "
"`%d` (shape = %s) is not equal to the expected rank `%s`" %
(name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
================================================
FILE: bert/modeling_test.py
================================================
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import json
import random
import re
from bert import modeling
import six
import tensorflow as tf
class BertModelTest(tf.test.TestCase):
class BertModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02,
scope=None):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.scope = scope
def create_model(self):
input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = BertModelTest.ids_tensor(
[self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = BertModelTest.ids_tensor(
[self.batch_size, self.seq_length], self.type_vocab_size)
config = modeling.BertConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range)
model = modeling.BertModel(
config=config,
is_training=self.is_training,
input_ids=input_ids,
input_mask=input_mask,
token_type_ids=token_type_ids,
scope=self.scope)
outputs = {
"embedding_output": model.get_embedding_output(),
"sequence_output": model.get_sequence_output(),
"pooled_output": model.get_pooled_output(),
"all_encoder_layers": model.get_all_encoder_layers(),
}
return outputs
def check_output(self, result):
self.parent.assertAllEqual(
result["embedding_output"].shape,
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertAllEqual(
result["sequence_output"].shape,
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertAllEqual(result["pooled_output"].shape,
[self.batch_size, self.hidden_size])
def test_default(self):
self.run_tester(BertModelTest.BertModelTester(self))
def test_config_to_json_string(self):
config = modeling.BertConfig(vocab_size=99, hidden_size=37)
obj = json.loads(config.to_json_string())
self.assertEqual(obj["vocab_size"], 99)
self.assertEqual(obj["hidden_size"], 37)
def run_tester(self, tester):
with self.test_session() as sess:
ops = tester.create_model()
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
sess.run(init_op)
output_result = sess.run(ops)
tester.check_output(output_result)
self.assert_all_tensors_reachable(sess, [init_op, ops])
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""
if rng is None:
rng = random.Random()
total_dims = 1
for dim in shape:
total_dims *= dim
values = []
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name)
def assert_all_tensors_reachable(self, sess, outputs):
"""Checks that all the tensors in the graph are reachable from outputs."""
graph = sess.graph
ignore_strings = [
"^.*/assert_less_equal/.*$",
"^.*/dilation_rate$",
"^.*/Tensordot/concat$",
"^.*/Tensordot/concat/axis$",
"^testing/.*$",
]
ignore_regexes = [re.compile(x) for x in ignore_strings]
unreachable = self.get_unreachable_ops(graph, outputs)
filtered_unreachable = []
for x in unreachable:
do_ignore = False
for r in ignore_regexes:
m = r.match(x.name)
if m is not None:
do_ignore = True
if do_ignore:
continue
filtered_unreachable.append(x)
unreachable = filtered_unreachable
self.assertEqual(
len(unreachable), 0, "The following ops are unreachable: %s" %
(" ".join([x.name for x in unreachable])))
@classmethod
def get_unreachable_ops(cls, graph, outputs):
"""Finds all of the tensors in graph that are unreachable from outputs."""
outputs = cls.flatten_recursive(outputs)
output_to_op = collections.defaultdict(list)
op_to_all = collections.defaultdict(list)
assign_out_to_in = collections.defaultdict(list)
for op in graph.get_operations():
for x in op.inputs:
op_to_all[op.name].append(x.name)
for y in op.outputs:
output_to_op[y.name].append(op.name)
op_to_all[op.name].append(y.name)
if str(op.type) == "Assign":
for y in op.outputs:
for x in op.inputs:
assign_out_to_in[y.name].append(x.name)
assign_groups = collections.defaultdict(list)
for out_name in assign_out_to_in.keys():
name_group = assign_out_to_in[out_name]
for n1 in name_group:
assign_groups[n1].append(out_name)
for n2 in name_group:
if n1 != n2:
assign_groups[n1].append(n2)
seen_tensors = {}
stack = [x.name for x in outputs]
while stack:
name = stack.pop()
if name in seen_tensors:
continue
seen_tensors[name] = True
if name in output_to_op:
for op_name in output_to_op[name]:
if op_name in op_to_all:
for input_name in op_to_all[op_name]:
if input_name not in stack:
stack.append(input_name)
expanded_names = []
if name in assign_groups:
for assign_name in assign_groups[name]:
expanded_names.append(assign_name)
for expanded_name in expanded_names:
if expanded_name not in stack:
stack.append(expanded_name)
unreachable_ops = []
for op in graph.get_operations():
is_unreachable = False
all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs]
for name in all_names:
if name not in seen_tensors:
is_unreachable = True
if is_unreachable:
unreachable_ops.append(op)
return unreachable_ops
@classmethod
def flatten_recursive(cls, item):
"""Flattens (potentially nested) a tuple/dictionary/list to a list."""
output = []
if isinstance(item, list):
output.extend(item)
elif isinstance(item, tuple):
output.extend(list(item))
elif isinstance(item, dict):
for (_, v) in six.iteritems(item):
output.append(v)
else:
return [item]
flat_output = []
for x in output:
flat_output.extend(cls.flatten_recursive(x))
return flat_output
if __name__ == "__main__":
tf.test.main()
================================================
FILE: bert/multilingual.md
================================================
## Models
There are two multilingual models currently available. We do not plan to release
more single-language models, but we may release `BERT-Large` versions of these
two in the future:
* **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
parameters
**The `Multilingual Cased (New)` model also fixes normalization issues in many
languages, so it is recommended in languages with non-Latin alphabets (and is
often better for most languages with Latin alphabets). When using this model,
make sure to pass `--do_lower_case=false` to `run_pretraining.py` and other
scripts.**
See the [list of languages](#list-of-languages) that the Multilingual model
supports. The Multilingual model does include Chinese (and English), but if your
fine-tuning data is Chinese-only, then the Chinese model will likely produce
better results.
## Results
To evaluate these systems, we use the
[XNLI dataset](https://github.com/facebookresearch/XNLI) dataset, which is a
version of [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) where the
dev and test sets have been translated (by humans) into 15 languages. Note that
the training set was *machine* translated (we used the translations provided by
XNLI, not Google NMT). For clarity, we only report on 6 languages below:
<!-- mdformat off(no table) -->
| System | English | Chinese | Spanish | German | Arabic | Urdu |
| --------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- |
| XNLI Baseline - Translate Train | 73.7 | 67.0 | 68.8 | 66.5 | 65.8 | 56.6 |
| XNLI Baseline - Translate Test | 73.7 | 68.3 | 70.7 | 68.7 | 66.8 | 59.3 |
| BERT - Translate Train Cased | **81.9** | **76.6** | **77.8** | **75.9** | **70.7** | 61.6 |
| BERT - Translate Train Uncased | 81.4 | 74.2 | 77.3 | 75.2 | 70.5 | 61.7 |
| BERT - Translate Test Uncased | 81.4 | 70.1 | 74.9 | 74.4 | 70.4 | **62.1** |
| BERT - Zero Shot Uncased | 81.4 | 63.8 | 74.3 | 70.5 | 62.1 | 58.3 |
<!-- mdformat on -->
The first two rows are baselines from the XNLI paper and the last three rows are
our results with BERT.
**Translate Train** means that the MultiNLI training set was machine translated
from English into the foreign language. So training and evaluation were both
done in the foreign language. Unfortunately, training was done on
machine-translated data, so it is impossible to quantify how much of the lower
accuracy (compared to English) is due to the quality of the machine translation
vs. the quality of the pre-trained model.
**Translate Test** means that the XNLI test set was machine translated from the
foreign language into English. So training and evaluation were both done on
English. However, test evaluation was done on machine-translated English, so the
accuracy depends on the quality of the machine translation system.
**Zero Shot** means that the Multilingual BERT system was fine-tuned on English
MultiNLI, and then evaluated on the foreign language XNLI test. In this case,
machine translation was not involved at all in either the pre-training or
fine-tuning.
Note that the English result is worse than the 84.2 MultiNLI baseline because
this training used Multilingual BERT rather than English-only BERT. This implies
that for high-resource languages, the Multilingual model is somewhat worse than
a single-language model. However, it is not feasible for us to train and
maintain dozens of single-language model. Therefore, if your goal is to maximize
performance with a language other than English or Chinese, you might find it
beneficial to run pre-training for additional steps starting from our
Multilingual model on data from your language of interest.
Here is a comparison of training Chinese models with the Multilingual
`BERT-Base` and Chinese-only `BERT-Base`:
System | Chinese
----------------------- | -------
XNLI Baseline | 67.0
BERT Multilingual Model | 74.2
BERT Chinese-only Model | 77.2
Similar to English, the single-language model does 3% better than the
Multilingual model.
## Fine-tuning Example
The multilingual model does **not** require any special consideration or API
changes. We did update the implementation of `BasicTokenizer` in
`tokenization.py` to support Chinese character tokenization, so please update if
you forked it. However, we did not change the tokenization API.
To test the new models, we did modify `run_classifier.py` to add support for the
[XNLI dataset](https://github.com/facebookresearch/XNLI). This is a 15-language
version of MultiNLI where the dev/test sets have been human-translated, and the
training set has been machine-translated.
To run the fine-tuning code, please download the
[XNLI dev/test set](https://s3.amazonaws.com/xnli/XNLI-1.0.zip) and the
[XNLI machine-translated training set](https://s3.amazonaws.com/xnli/XNLI-MT-1.0.zip)
and then unpack both .zip files into some directory `$XNLI_DIR`.
To run fine-tuning on XNLI. The language is hard-coded into `run_classifier.py`
(Chinese by default), so please modify `XnliProcessor` if you want to run on
another language.
This is a large dataset, so this will training will take a few hours on a GPU
(or about 30 minutes on a Cloud TPU). To run an experiment quickly for
debugging, just set `num_train_epochs` to a small value like `0.1`.
```shell
export BERT_BASE_DIR=/path/to/bert/chinese_L-12_H-768_A-12 # or multilingual_L-12_H-768_A-12
export XNLI_DIR=/path/to/xnli
python run_classifier.py \
--task_name=XNLI \
--do_train=true \
--do_eval=true \
--data_dir=$XNLI_DIR \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--max_seq_length=128 \
--train_batch_size=32 \
--learning_rate=5e-5 \
--num_train_epochs=2.0 \
--output_dir=/tmp/xnli_output/
```
With the Chinese-only model, the results should look something like this:
```
***** Eval results *****
eval_accuracy = 0.774116
eval_loss = 0.83554
global_step = 24543
loss = 0.74603
```
## Details
### Data Source and Sampling
The languages chosen were the
[top 100 languages with the largest Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).
The entire Wikipedia dump for each language (excluding user and talk pages) was
taken as the training data for each language
However, the size of the Wikipedia for a given language varies greatly, and
therefore low-resource languages may be "under-represented" in terms of the
neural network model (under the assumption that languages are "competing" for
limited model capacity to some extent).
However, the size of a Wikipedia also correlates with the number of speakers of
a language, and we also don't want to overfit the model by performing thousands
of epochs over a tiny Wikipedia for a particular language.
To balance these two factors, we performed exponentially smoothed weighting of
the data during pre-training data creation (and WordPiece vocab creation). In
other words, let's say that the probability of a language is *P(L)*, e.g.,
*P(English) = 0.21* means that after concatenating all of the Wikipedias
together, 21% of our data is English. We exponentiate each probability by some
factor *S* and then re-normalize, and sample from that distribution. In our case
we use *S=0.7*. So, high-resource languages like English will be under-sampled,
and low-resource languages like Icelandic will be over-sampled. E.g., in the
original distribution English would be sampled 1000x more than Icelandic, but
after smoothing it's only sampled 100x more.
### Tokenization
For tokenization, we use a 110k shared WordPiece vocabulary. The word counts are
weighted the same way as the data, so low-resource languages are upweighted by
some factor. We intentionally do *not* use any marker to denote the input
language (so that zero-shot training can work).
Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace
characters, we add spaces around every character in the
[CJK Unicode range](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_\(Unicode_block\))
before applying WordPiece. This means that Chinese is effectively
character-tokenized. Note that the CJK Unicode block only includes
Chinese-origin characters and does *not* include Hangul Korean or
Katakana/Hiragana Japanese, which are tokenized with whitespace+WordPiece like
all other languages.
For all other languages, we apply the
[same recipe as English](https://github.com/google-research/bert#tokenization):
(a) lower casing+accent removal, (b) punctuation splitting, (c) whitespace
tokenization. We understand that accent markers have substantial meaning in some
languages, but felt that the benefits of reducing the effective vocabulary make
up for this. Generally the strong contextual models of BERT should make up for
any ambiguity introduced by stripping accent markers.
### List of Languages
The multilingual model supports the following languages. These languages were
chosen because they are the top 100 languages with the largest Wikipedias:
* Afrikaans
* Albanian
* Arabic
* Aragonese
* Armenian
* Asturian
* Azerbaijani
* Bashkir
* Basque
* Bavarian
* Belarusian
* Bengali
* Bishnupriya Manipuri
* Bosnian
* Breton
* Bulgarian
* Burmese
* Catalan
* Cebuano
* Chechen
* Chinese (Simplified)
* Chinese (Traditional)
* Chuvash
* Croatian
* Czech
* Danish
* Dutch
* English
* Estonian
* Finnish
* French
* Galician
* Georgian
* German
* Greek
* Gujarati
* Haitian
* Hebrew
* Hindi
* Hungarian
* Icelandic
* Ido
* Indonesian
* Irish
* Italian
* Japanese
* Javanese
* Kannada
* Kazakh
* Kirghiz
* Korean
* Latin
* Latvian
* Lithuanian
* Lombard
* Low Saxon
* Luxembourgish
* Macedonian
* Malagasy
* Malay
* Malayalam
* Marathi
* Minangkabau
* Nepali
* Newar
* Norwegian (Bokmal)
* Norwegian (Nynorsk)
* Occitan
* Persian (Farsi)
* Piedmontese
* Polish
* Portuguese
* Punjabi
* Romanian
* Russian
* Scots
* Serbian
* Serbo-Croatian
* Sicilian
* Slovak
* Slovenian
* South Azerbaijani
* Spanish
* Sundanese
* Swahili
* Swedish
* Tagalog
* Tajik
* Tamil
* Tatar
* Telugu
* Turkish
* Ukrainian
* Urdu
* Uzbek
* Vietnamese
* Volapük
* Waray-Waray
* Welsh
* West Frisian
* Western Punjabi
* Yoruba
The **Multilingual Cased (New)** release contains additionally **Thai** and
**Mongolian**, which were not included in the original release.
================================================
FILE: bert/optimization.py
================================================
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
"""Creates an optimizer training op."""
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
# Implements linear decay of the learning rate.
learning_rate = tf.train.polynomial_decay(
learning_rate,
global_step,
num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
# Implements linear warmup. I.e., if global_step < num_warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
if num_warmup_steps:
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
warmup_percent_done = global_steps_float / warmup_steps_float
warmup_learning_rate = init_lr * warmup_percent_done
is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
learning_rate = (
(1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
# It is recommended that you use this optimizer for fine tuning, since this
# is how the model was trained (note that the Adam m/v variables are NOT
# loaded from init_checkpoint.)
optimizer = AdamWeightDecayOptimizer(
learning_rate=learning_rate,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
if use_tpu:
optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
# This is how the model was pre-trained.
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = optimizer.apply_gradients(
zip(grads, tvars), global_step=global_step)
# Normally the global step update is done inside of `apply_gradients`.
# However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
# a different optimizer, you should probably take this line out.
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])
return train_op
class AdamWeightDecayOptimizer(tf.train.Optimizer):
"""A basic Adam optimizer that includes "correct" L2 weight decay."""
def __init__(self,
learning_rate,
weight_decay_rate=0.0,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=None,
name="AdamWeightDecayOptimizer"):
"""Constructs a AdamWeightDecayOptimizer."""
super(AdamWeightDecayOptimizer, self).__init__(False, name)
self.learning_rate = learning_rate
self.weight_decay_rate = weight_decay_rate
self.beta_1 = beta_1
self.beta_2 = beta_2
self.epsilon = epsilon
self.exclude_from_weight_decay = exclude_from_weight_decay
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
"""See base class."""
assignments = []
for (grad, param) in grads_and_vars:
if grad is None or param is None:
continue
param_name = self._get_variable_name(param.name)
m = tf.get_variable(
name=param_name + "/adam_m",
shape=param.shape.as_list(),
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
v = tf.get_variable(
name=param_name + "/adam_v",
shape=param.shape.as_list(),
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
# Standard Adam update.
next_m = (
tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
next_v = (
tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
tf.square(grad)))
update = next_m / (tf.sqrt(next_v) + self.epsilon)
# Just adding the square of the weights to the loss function is *not*
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want ot decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if self._do_use_weight_decay(param_name):
update += self.weight_decay_rate * param
update_with_lr = self.learning_rate * update
next_param = param - update_with_lr
assignments.extend(
[param.assign(next_param),
m.assign(next_m),
v.assign(next_v)])
return tf.group(*assignments, name=name)
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if not self.weight_decay_rate:
return False
if self.exclude_from_weight_decay:
for r in self.exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
def _get_variable_name(self, param_name):
"""Get the variable name from the tensor name."""
m = re.match("^(.*):\\d+$", param_name)
if m is not None:
param_name = m.group(1)
return param_name
================================================
FILE: bert/optimization_test.py
================================================
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from bert import optimization
import tensorflow as tf
class OptimizationTest(tf.test.TestCase):
def test_adam(self):
with self.test_session() as sess:
w = tf.get_variable(
"w",
shape=[3],
initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
x = tf.constant([0.4, 0.2, -0.5])
loss = tf.reduce_mean(tf.square(x - w))
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
global_step = tf.train.get_or_create_global_step()
optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
sess.run(init_op)
for _ in range(100):
sess.run(train_op)
w_np = sess.run(w)
self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
if __name__ == "__main__":
tf.test.main()
================================================
FILE: bert/predicting_movie_reviews_with_bert_on_tf_hub.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Predicting Movie Reviews with BERT on TF Hub.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"metadata": {
"id": "j0a4mTk9o1Qg",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Copyright 2019 Google Inc.\n",
"\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "dCpvgG0vwXAZ",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"#Predicting Movie Review Sentiment with BERT on TF Hub"
]
},
{
"metadata": {
"id": "xiYrZKaHwV81",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"If you’ve been following Natural Language Processing over the past year, you’ve probably heard of BERT: Bidirectional Encoder Representations from Transformers. It’s a neural network architecture designed by Google researchers that’s totally transformed what’s state-of-the-art for NLP tasks, like text classification, translation, summarization, and question answering.\n",
"\n",
"Now that BERT's been added to [TF Hub](https://www.tensorflow.org/hub) as a loadable module, it's easy(ish) to add into existing Tensorflow text pipelines. In an existing pipeline, BERT can replace text embedding layers like ELMO and GloVE. Alternatively, [finetuning](http://wiki.fast.ai/index.php/Fine_tuning) BERT can provide both an accuracy boost and faster training time in many cases.\n",
"\n",
"Here, we'll train a model to predict whether an IMDB movie review is positive or negative using BERT in Tensorflow with tf hub. Some code was adapted from [this colab notebook](https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb). Let's get started!"
]
},
{
"metadata": {
"id": "hsZvic2YxnTz",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"from sklearn.model_selection import train_test_split\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"import tensorflow_hub as hub\n",
"from datetime import datetime"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "cp5wfXDx5SPH",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"In addition to the standard libraries we imported above, we'll need to install BERT's python package."
]
},
{
"metadata": {
"id": "jviywGyWyKsA",
"colab_type": "code",
"outputId": "166f3005-d219-404f-b201-2a0b75480360",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
}
},
"cell_type": "code",
"source": [
"!pip install bert-tensorflow"
],
"execution_count": 38,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: bert-tensorflow in /usr/local/lib/python3.6/dist-packages (1.0.1)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from bert-tensorflow) (1.11.0)\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "hhbGEfwgdEtw",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"import bert\n",
"from bert import run_classifier\n",
"from bert import optimization\n",
"from bert import tokenization"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "KVB3eOcjxxm1",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.\n",
"\n",
"Alternatively, if you're a GCP user, you can store output in a GCP bucket. To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.\n",
"\n",
"Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist)."
]
},
{
"metadata": {
"id": "US_EAnICvP7f",
"colab_type": "code",
"outputId": "7780a032-31d4-4794-e6aa-664a5d2ae7dd",
"cellView": "form",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"cell_type": "code",
"source": [
"# Set the output directory for saving model file\n",
"# Optionally, set a GCP bucket location\n",
"\n",
"OUTPUT_DIR = 'OUTPUT_DIR_NAME'#@param {type:\"string\"}\n",
"#@markdown Whether or not to clear/delete the directory and create a new one\n",
"DO_DELETE = False #@param {type:\"boolean\"}\n",
"#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.\n",
"USE_BUCKET = True #@param {type:\"boolean\"}\n",
"BUCKET = 'BUCKET_NAME' #@param {type:\"string\"}\n",
"\n",
"if USE_BUCKET:\n",
" OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)\n",
" from google.colab import auth\n",
" auth.authenticate_user()\n",
"\n",
"if DO_DELETE:\n",
" try:\n",
" tf.gfile.DeleteRecursively(OUTPUT_DIR)\n",
" except:\n",
" # Doesn't matter if the directory didn't exist\n",
" pass\n",
"tf.gfile.MakeDirs(OUTPUT_DIR)\n",
"print('***** Model output directory: {} *****'.format(OUTPUT_DIR))\n"
],
"execution_count": 40,
"outputs": [
{
"output_type": "stream",
"text": [
"***** Model output directory: gs://bert-tfhub/aclImdb_v1 *****\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "pmFYvkylMwXn",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"#Data"
]
},
{
"metadata": {
"id": "MC_w8SRqN0fr",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"First, let's download the dataset, hosted by Stanford. The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub)."
]
},
{
"metadata": {
"id": "fom_ff20gyy6",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"from tensorflow import keras\n",
"import os\n",
"import re\n",
"\n",
"# Load all files from a directory in a DataFrame.\n",
"def load_directory_data(directory):\n",
" data = {}\n",
" data[\"sentence\"] = []\n",
" data[\"sentiment\"] = []\n",
" for file_path in os.listdir(directory):\n",
" with tf.gfile.GFile(os.path.join(directory, file_path), \"r\") as f:\n",
" data[\"sentence\"].append(f.read())\n",
" data[\"sentiment\"].append(re.match(\"\\d+_(\\d+)\\.txt\", file_path).group(1))\n",
" return pd.DataFrame.from_dict(data)\n",
"\n",
"# Merge positive and negative examples, add a polarity column and shuffle.\n",
"def load_dataset(directory):\n",
" pos_df = load_directory_data(os.path.join(directory, \"pos\"))\n",
" neg_df = load_directory_data(os.path.join(directory, \"neg\"))\n",
" pos_df[\"polarity\"] = 1\n",
" neg_df[\"polarity\"] = 0\n",
" return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)\n",
"\n",
"# Download and process the dataset files.\n",
"def download_and_load_datasets(force_download=False):\n",
" dataset = tf.keras.utils.get_file(\n",
" fname=\"aclImdb.tar.gz\", \n",
" origin=\"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\", \n",
" extract=True)\n",
" \n",
" train_df = load_dataset(os.path.join(os.path.dirname(dataset), \n",
" \"aclImdb\", \"train\"))\n",
" test_df = load_dataset(os.path.join(os.path.dirname(dataset), \n",
" \"aclImdb\", \"test\"))\n",
" \n",
" return train_df, test_df\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "2abfwdn-g135",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"train, test = download_and_load_datasets()"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "XA8WHJgzhIZf",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"To keep training fast, we'll take a sample of 5000 train and test examples, respectively."
]
},
{
"metadata": {
"id": "lw_F488eixTV",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"train = train.sample(5000)\n",
"test = test.sample(5000)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "prRQM8pDi8xI",
"colab_type": "code",
"outputId": "34445cb8-2be0-4379-fdbc-7794091f6049",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"cell_type": "code",
"source": [
"train.columns"
],
"execution_count": 44,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Index(['sentence', 'sentiment', 'polarity'], dtype='object')"
]
},
"metadata": {
"tags": []
},
"execution_count": 44
}
]
},
{
"metadata": {
"id": "sfRnHSz3iSXz",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"For us, our input data is the 'sentence' column and our label is the 'polarity' column (0, 1 for negative and positive, respecitvely)"
]
},
{
"metadata": {
"id": "IuMOGwFui4it",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"DATA_COLUMN = 'sentence'\n",
"LABEL_COLUMN = 'polarity'\n",
"# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'\n",
"label_list = [0, 1]"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "V399W0rqNJ-Z",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"#Data Preprocessing\n",
"We'll need to transform our data into a format BERT understands. This involves two steps. First, we create `InputExample`'s using the constructor provided in the BERT library.\n",
"\n",
"- `text_a` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. \n",
"- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.\n",
"- `label` is the label for our example, i.e. True, False"
]
},
{
"metadata": {
"id": "p9gEt5SmM6i6",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
"train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example\n",
" text_a = x[DATA_COLUMN], \n",
" text_b = None, \n",
" label = x[LABEL_COLUMN]), axis = 1)\n",
"\n",
"test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n",
" text_a = x[DATA_COLUMN], \n",
" text_b = None, \n",
" label = x[LABEL_COLUMN]), axis = 1)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "SCZWZtKxObjh",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):\n",
"\n",
"\n",
"1. Lowercase our text (if we're using a BERT lowercase model)\n",
"2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n",
"3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n",
"4. Map our words to indexes using a vocab file that BERT provides\n",
"5. Add special \"CLS\" and \"SEP\" tokens (see the [readme](https://github.com/google-research/bert))\n",
"6. Append \"index\" and \"segment\" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))\n",
"\n",
"Happily, we don't have to worry about most of these details.\n",
"\n",
"\n"
]
},
{
"metadata": {
"id": "qMWiDtpyQSoU",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:"
]
},
{
"metadata": {
"id": "IhJSe0QHNG7U",
"colab_type": "code",
"outputId": "20b28cc7-3cb3-4ce6-bfff-a7847ce3bbaa",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"cell_type": "code",
"source": [
"# This is a path to an uncased (all lowercase) version of BERT\n",
"BERT_MODEL_HUB = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"\n",
"\
gitextract_xilhx8ju/
├── .idea/
│ ├── Cail2019_track2.iml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── README.md
├── bert/
│ ├── .gitignore
│ ├── CONTRIBUTING.md
│ ├── LICENSE
│ ├── README.md
│ ├── __init__.py
│ ├── create_pretraining_data.py
│ ├── extract_features.py
│ ├── modeling.py
│ ├── modeling_test.py
│ ├── multilingual.md
│ ├── optimization.py
│ ├── optimization_test.py
│ ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb
│ ├── requirements.txt
│ ├── run_classifier.py
│ ├── run_classifier_with_tfhub.py
│ ├── run_pretraining.py
│ ├── run_squad.py
│ ├── sample_text.txt
│ ├── tokenization.py
│ └── tokenization_test.py
├── convert.py
├── createPretrainData.py
├── data/
│ ├── divorce/
│ │ ├── data_small_selected.json
│ │ ├── tags.txt
│ │ └── train_selected.json
│ ├── labor/
│ │ ├── data_small_selected.json
│ │ ├── tags.txt
│ │ └── train_selected.json
│ └── loan/
│ ├── data_small_selected.json
│ ├── tags.txt
│ └── train_selected.json
├── evaluation.py
├── genPretrainData.py
├── run_pretrain.py
├── search_threshold.py
├── train.py
└── utils/
├── __init__.py
├── ckpt2pb.py
├── evaluate.py
├── models.py
└── predict.py
SYMBOL INDEX (271 symbols across 22 files)
FILE: bert/create_pretraining_data.py
class TrainingInstance (line 64) | class TrainingInstance(object):
method __init__ (line 67) | def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm...
method __str__ (line 75) | def __str__(self):
method __repr__ (line 88) | def __repr__(self):
function write_instance_to_example_files (line 92) | def write_instance_to_example_files(instances, tokenizer, max_seq_length,
function create_int_feature (line 165) | def create_int_feature(values):
function create_float_feature (line 170) | def create_float_feature(values):
function create_training_instances (line 175) | def create_training_instances(input_files, tokenizer, max_seq_length,
function create_instances_from_document (line 219) | def create_instances_from_document(
function create_masked_lm_predictions (line 338) | def create_masked_lm_predictions(tokens, masked_lm_prob,
function truncate_seq_pair (line 391) | def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
function main (line 409) | def main(_):
FILE: bert/extract_features.py
class InputExample (line 81) | class InputExample(object):
method __init__ (line 83) | def __init__(self, unique_id, text_a, text_b):
class InputFeatures (line 89) | class InputFeatures(object):
method __init__ (line 92) | def __init__(self, unique_id, tokens, input_ids, input_mask, input_typ...
function input_fn_builder (line 100) | def input_fn_builder(features, seq_length):
function model_fn_builder (line 148) | def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
function convert_examples_to_features (line 210) | def convert_examples_to_features(examples, seq_length, tokenizer):
function _truncate_seq_pair (line 302) | def _truncate_seq_pair(tokens_a, tokens_b, max_length):
function read_examples (line 319) | def read_examples(input_file):
function main (line 343) | def main(_):
FILE: bert/modeling.py
class BertConfig (line 31) | class BertConfig(object):
method __init__ (line 34) | def __init__(self,
method from_dict (line 83) | def from_dict(cls, json_object):
method from_json_file (line 91) | def from_json_file(cls, json_file):
method to_dict (line 97) | def to_dict(self):
method to_json_string (line 102) | def to_json_string(self):
class BertModel (line 107) | class BertModel(object):
method __init__ (line 131) | def __init__(self,
method get_pooled_output (line 234) | def get_pooled_output(self):
method get_sequence_output (line 237) | def get_sequence_output(self):
method get_all_encoder_layers (line 246) | def get_all_encoder_layers(self):
method get_embedding_output (line 249) | def get_embedding_output(self):
method get_embedding_table (line 260) | def get_embedding_table(self):
function gelu (line 264) | def gelu(x):
function get_activation (line 280) | def get_activation(activation_string):
function get_assignment_map_from_checkpoint (line 317) | def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
function dropout (line 344) | def dropout(input_tensor, dropout_prob):
function layer_norm (line 362) | def layer_norm(input_tensor, name=None):
function layer_norm_and_dropout (line 368) | def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
function create_initializer (line 375) | def create_initializer(initializer_range=0.02):
function embedding_lookup (line 380) | def embedding_lookup(input_ids,
function embedding_postprocessor (line 428) | def embedding_postprocessor(input_tensor,
function create_attention_mask_from_input_mask (line 524) | def create_attention_mask_from_input_mask(from_tensor, to_mask):
function attention_layer (line 558) | def attention_layer(from_tensor,
function transformer_model (line 754) | def transformer_model(input_tensor,
function get_shape_list (line 895) | def get_shape_list(tensor, expected_rank=None, name=None):
function reshape_to_matrix (line 932) | def reshape_to_matrix(input_tensor):
function reshape_from_matrix (line 946) | def reshape_from_matrix(output_tensor, orig_shape_list):
function assert_rank (line 959) | def assert_rank(tensor, expected_rank, name=None):
FILE: bert/modeling_test.py
class BertModelTest (line 29) | class BertModelTest(tf.test.TestCase):
class BertModelTester (line 31) | class BertModelTester(object):
method __init__ (line 33) | def __init__(self,
method create_model (line 71) | def create_model(self):
method check_output (line 114) | def check_output(self, result):
method test_default (line 126) | def test_default(self):
method test_config_to_json_string (line 129) | def test_config_to_json_string(self):
method run_tester (line 135) | def run_tester(self, tester):
method ids_tensor (line 147) | def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
method assert_all_tensors_reachable (line 162) | def assert_all_tensors_reachable(self, sess, outputs):
method get_unreachable_ops (line 194) | def get_unreachable_ops(cls, graph, outputs):
method flatten_recursive (line 257) | def flatten_recursive(cls, item):
FILE: bert/optimization.py
function create_optimizer (line 25) | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, u...
class AdamWeightDecayOptimizer (line 87) | class AdamWeightDecayOptimizer(tf.train.Optimizer):
method __init__ (line 90) | def __init__(self,
method apply_gradients (line 108) | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
method _do_use_weight_decay (line 159) | def _do_use_weight_decay(self, param_name):
method _get_variable_name (line 169) | def _get_variable_name(self, param_name):
FILE: bert/optimization_test.py
class OptimizationTest (line 23) | class OptimizationTest(tf.test.TestCase):
method test_adam (line 25) | def test_adam(self):
FILE: bert/run_classifier.py
class InputExample (line 127) | class InputExample(object):
method __init__ (line 130) | def __init__(self, guid, text_a, text_b=None, label=None):
class PaddingInputExample (line 148) | class PaddingInputExample(object):
class InputFeatures (line 161) | class InputFeatures(object):
method __init__ (line 164) | def __init__(self,
class DataProcessor (line 177) | class DataProcessor(object):
method get_train_examples (line 180) | def get_train_examples(self, data_dir):
method get_dev_examples (line 184) | def get_dev_examples(self, data_dir):
method get_test_examples (line 188) | def get_test_examples(self, data_dir):
method get_labels (line 192) | def get_labels(self):
method _read_tsv (line 197) | def _read_tsv(cls, input_file, quotechar=None):
class XnliProcessor (line 207) | class XnliProcessor(DataProcessor):
method __init__ (line 210) | def __init__(self):
method get_train_examples (line 213) | def get_train_examples(self, data_dir):
method get_dev_examples (line 232) | def get_dev_examples(self, data_dir):
method get_labels (line 250) | def get_labels(self):
class MnliProcessor (line 255) | class MnliProcessor(DataProcessor):
method get_train_examples (line 258) | def get_train_examples(self, data_dir):
method get_dev_examples (line 263) | def get_dev_examples(self, data_dir):
method get_test_examples (line 269) | def get_test_examples(self, data_dir):
method get_labels (line 274) | def get_labels(self):
method _create_examples (line 278) | def _create_examples(self, lines, set_type):
class MrpcProcessor (line 296) | class MrpcProcessor(DataProcessor):
method get_train_examples (line 299) | def get_train_examples(self, data_dir):
method get_dev_examples (line 304) | def get_dev_examples(self, data_dir):
method get_test_examples (line 309) | def get_test_examples(self, data_dir):
method get_labels (line 314) | def get_labels(self):
method _create_examples (line 318) | def _create_examples(self, lines, set_type):
class ColaProcessor (line 336) | class ColaProcessor(DataProcessor):
method get_train_examples (line 339) | def get_train_examples(self, data_dir):
method get_dev_examples (line 344) | def get_dev_examples(self, data_dir):
method get_test_examples (line 349) | def get_test_examples(self, data_dir):
method get_labels (line 354) | def get_labels(self):
method _create_examples (line 358) | def _create_examples(self, lines, set_type):
function convert_single_example (line 377) | def convert_single_example(ex_index, example, label_list, max_seq_length,
function file_based_convert_examples_to_features (line 479) | def file_based_convert_examples_to_features(
function file_based_input_fn_builder (line 509) | def file_based_input_fn_builder(input_file, seq_length, is_training,
function _truncate_seq_pair (line 557) | def _truncate_seq_pair(tokens_a, tokens_b, max_length):
function create_model (line 574) | def create_model(bert_config, is_training, input_ids, input_mask, segmen...
function model_fn_builder (line 619) | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_...
function input_fn_builder (line 713) | def input_fn_builder(features, seq_length, is_training, drop_remainder):
function convert_examples_to_features (line 767) | def convert_examples_to_features(examples, label_list, max_seq_length,
function main (line 783) | def main(_):
FILE: bert/run_classifier_with_tfhub.py
function create_model (line 37) | def create_model(is_training, input_ids, input_mask, segment_ids, labels,
function model_fn_builder (line 87) | def model_fn_builder(num_labels, learning_rate, num_train_steps,
function create_tokenizer_from_hub_module (line 146) | def create_tokenizer_from_hub_module(bert_hub_module_handle):
function main (line 158) | def main(_):
FILE: bert/run_pretraining.py
function model_fn_builder (line 109) | def model_fn_builder(bert_config, init_checkpoint, learning_rate,
function get_masked_lm_output (line 240) | def get_masked_lm_output(bert_config, input_tensor, output_weights, posi...
function get_next_sentence_output (line 285) | def get_next_sentence_output(bert_config, input_tensor, labels):
function gather_indexes (line 308) | def gather_indexes(sequence_tensor, positions):
function input_fn_builder (line 324) | def input_fn_builder(input_files,
function _decode_record (line 391) | def _decode_record(record, name_to_features):
function main (line 406) | def main(_):
FILE: bert/run_squad.py
class SquadExample (line 157) | class SquadExample(object):
method __init__ (line 163) | def __init__(self,
method __str__ (line 179) | def __str__(self):
method __repr__ (line 182) | def __repr__(self):
class InputFeatures (line 197) | class InputFeatures(object):
method __init__ (line 200) | def __init__(self,
function read_squad_examples (line 227) | def read_squad_examples(input_file, is_training):
function convert_examples_to_features (line 309) | def convert_examples_to_features(examples, tokenizer, max_seq_length,
function _improve_answer_span (line 476) | def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
function _check_is_max_context (line 513) | def _check_is_max_context(doc_spans, cur_span_index, position):
function create_model (line 550) | def create_model(bert_config, is_training, input_ids, input_mask, segmen...
function model_fn_builder (line 590) | def model_fn_builder(bert_config, init_checkpoint, learning_rate,
function input_fn_builder (line 687) | def input_fn_builder(input_file, seq_length, is_training, drop_remainder):
function write_predictions (line 741) | def write_predictions(all_examples, all_features, all_results, n_best_size,
function get_final_text (line 927) | def get_final_text(pred_text, orig_text, do_lower_case):
function _get_best_indexes (line 1023) | def _get_best_indexes(logits, n_best_size):
function _compute_softmax (line 1035) | def _compute_softmax(scores):
class FeatureWriter (line 1058) | class FeatureWriter(object):
method __init__ (line 1061) | def __init__(self, filename, is_training):
method process_feature (line 1067) | def process_feature(self, feature):
method close (line 1093) | def close(self):
function validate_flags_or_throw (line 1097) | def validate_flags_or_throw(bert_config):
function main (line 1126) | def main(_):
FILE: bert/tokenization.py
function validate_case_matches_checkpoint (line 28) | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
function convert_to_unicode (line 78) | def convert_to_unicode(text):
function printable_text (line 98) | def printable_text(text):
function load_vocab (line 121) | def load_vocab(vocab_file):
function convert_by_vocab (line 136) | def convert_by_vocab(vocab, items):
function convert_tokens_to_ids (line 144) | def convert_tokens_to_ids(vocab, tokens):
function convert_ids_to_tokens (line 148) | def convert_ids_to_tokens(inv_vocab, ids):
function whitespace_tokenize (line 152) | def whitespace_tokenize(text):
class FullTokenizer (line 161) | class FullTokenizer(object):
method __init__ (line 164) | def __init__(self, vocab_file, do_lower_case=True):
method tokenize (line 170) | def tokenize(self, text):
method convert_tokens_to_ids (line 178) | def convert_tokens_to_ids(self, tokens):
method convert_ids_to_tokens (line 181) | def convert_ids_to_tokens(self, ids):
class BasicTokenizer (line 185) | class BasicTokenizer(object):
method __init__ (line 188) | def __init__(self, do_lower_case=True):
method tokenize (line 196) | def tokenize(self, text):
method _run_strip_accents (line 220) | def _run_strip_accents(self, text):
method _run_split_on_punc (line 231) | def _run_split_on_punc(self, text):
method _tokenize_chinese_chars (line 251) | def _tokenize_chinese_chars(self, text):
method _is_chinese_char (line 264) | def _is_chinese_char(self, cp):
method _clean_text (line 286) | def _clean_text(self, text):
class WordpieceTokenizer (line 300) | class WordpieceTokenizer(object):
method __init__ (line 303) | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=...
method tokenize (line 308) | def tokenize(self, text):
function _is_whitespace (line 362) | def _is_whitespace(char):
function _is_control (line 374) | def _is_control(char):
function _is_punctuation (line 386) | def _is_punctuation(char):
FILE: bert/tokenization_test.py
class TokenizationTest (line 26) | class TokenizationTest(tf.test.TestCase):
method test_full_tokenizer (line 28) | def test_full_tokenizer(self):
method test_chinese (line 51) | def test_chinese(self):
method test_basic_tokenizer_lower (line 58) | def test_basic_tokenizer_lower(self):
method test_basic_tokenizer_no_lower (line 66) | def test_basic_tokenizer_no_lower(self):
method test_wordpiece_tokenizer (line 73) | def test_wordpiece_tokenizer(self):
method test_convert_tokens_to_ids (line 93) | def test_convert_tokens_to_ids(self):
method test_is_whitespace (line 107) | def test_is_whitespace(self):
method test_is_control (line 117) | def test_is_control(self):
method test_is_punctuation (line 126) | def test_is_punctuation(self):
FILE: createPretrainData.py
class TrainingInstance (line 78) | class TrainingInstance(object):
method __init__ (line 81) | def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm...
method __str__ (line 89) | def __str__(self):
method __repr__ (line 102) | def __repr__(self):
function write_instance_to_example_files (line 106) | def write_instance_to_example_files(instances, tokenizer, max_seq_length,
function create_int_feature (line 179) | def create_int_feature(values):
function create_float_feature (line 184) | def create_float_feature(values):
function create_training_instances (line 189) | def create_training_instances(input_files, tokenizer, max_seq_length,
function create_instances_from_document (line 233) | def create_instances_from_document(
function create_masked_lm_predictions (line 352) | def create_masked_lm_predictions(tokens, masked_lm_prob,
function truncate_seq_pair (line 405) | def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
function main (line 423) | def main(_):
FILE: evaluation.py
function readThreshold (line 14) | def readThreshold(fname):
function add_ (line 20) | def add_(arr1, arr2):
function re_match (line 26) | def re_match(text, feature):
function getMatch (line 34) | def getMatch(feature, sentences, model):
function load_file (line 42) | def load_file(filename):
FILE: genPretrainData.py
function getSentences (line 3) | def getSentences(filename):
function work (line 15) | def work(task):
FILE: run_pretrain.py
function model_fn_builder (line 119) | def model_fn_builder(bert_config, init_checkpoint, learning_rate,
function get_masked_lm_output (line 250) | def get_masked_lm_output(bert_config, input_tensor, output_weights, posi...
function get_next_sentence_output (line 295) | def get_next_sentence_output(bert_config, input_tensor, labels):
function gather_indexes (line 318) | def gather_indexes(sequence_tensor, positions):
function input_fn_builder (line 334) | def input_fn_builder(input_files,
function _decode_record (line 401) | def _decode_record(record, name_to_features):
function main (line 416) | def main(_):
FILE: search_threshold.py
function getLab (line 14) | def getLab(probs, id2label, threshold):
function getPreLab (line 21) | def getPreLab(array, id2label, threshold):
function load_file (line 27) | def load_file(filename):
function searchThreshold (line 39) | def searchThreshold(domain, model_pb, threshold_dir,
FILE: train.py
function getNumClasses (line 171) | def getNumClasses():
class InputExample (line 181) | class InputExample(object):
method __init__ (line 184) | def __init__(self, guid, text_a, text_b=None, label=None):
class PaddingInputExample (line 202) | class PaddingInputExample(object):
class InputFeatures (line 215) | class InputFeatures(object):
method __init__ (line 218) | def __init__(self,
class DataProcessor (line 231) | class DataProcessor(object):
method get_train_examples (line 234) | def get_train_examples(self, data_dir):
method get_dev_examples (line 238) | def get_dev_examples(self, data_dir):
method get_test_examples (line 242) | def get_test_examples(self, data_dir):
method get_labels (line 246) | def get_labels(self):
method _read_json (line 251) | def _read_json(cls, input_file):
class MultilabelClassfier (line 265) | class MultilabelClassfier(DataProcessor):
method get_train_examples (line 268) | def get_train_examples(self, data_dir):
method get_dev_examples (line 274) | def get_dev_examples(self, data_dir):
method get_test_examples (line 279) | def get_test_examples(self, data_dir):
method get_labels (line 284) | def get_labels(self):
method _create_examples (line 295) | def _create_examples(self, lines, set_type):
function convert_single_example (line 311) | def convert_single_example(ex_index, example, label_list, max_seq_length,
function file_based_convert_examples_to_features (line 420) | def file_based_convert_examples_to_features(
function file_based_input_fn_builder (line 454) | def file_based_input_fn_builder(input_file, seq_length, is_training,
function _truncate_seq_pair (line 502) | def _truncate_seq_pair(tokens_a, tokens_b, max_length):
function create_model_original (line 519) | def create_model_original(bert_config, is_training, input_ids, input_mas...
function focal_loss (line 563) | def focal_loss(prediction_tensor, target_tensor, alpha=FLAGS.alpha, gamm...
function create_model (line 594) | def create_model(bert_config, is_training, input_ids, input_mask, segmen...
function model_fn_builder (line 632) | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_...
function input_fn_builder (line 738) | def input_fn_builder(features, seq_length, is_training, drop_remainder):
function convert_examples_to_features (line 792) | def convert_examples_to_features(examples, label_list, max_seq_length,
function main (line 808) | def main(_):
FILE: utils/ckpt2pb.py
function create_model (line 13) | def create_model(bert_config, input_ids, input_mask, segment_ids,
function convert (line 57) | def convert(task, tagDir, originDir, convertDir, model_type, bert_dir):
FILE: utils/evaluate.py
class Judger (line 6) | class Judger:
method __init__ (line 8) | def __init__(self, tag_path):
method format_result (line 19) | def format_result(result):
method gen_new_result (line 30) | def gen_new_result(self, result, truth, label):
method get_value (line 59) | def get_value(res):
method gen_score (line 77) | def gen_score(self, arr):
method test (line 97) | def test(self, truth_label, pre_label):
function evaluate (line 108) | def evaluate(predict_labels, target_labels, tag_dir):
FILE: utils/models.py
class RCNN (line 3) | class RCNN():
method __init__ (line 4) | def __init__(self, embedding, context_dim, hidden_dim, dropout_keep_pr...
method getLogits (line 40) | def getLogits(self):
class RCNNATT (line 43) | class RCNNATT():
method __init__ (line 44) | def __init__(self, embedding, context_dim, hidden_dim, dropout_keep_pr...
method getLogits (line 91) | def getLogits(self):
FILE: utils/predict.py
class InputFeatures (line 14) | class InputFeatures(object):
method __init__ (line 16) | def __init__(self,
function convert_single_example (line 28) | def convert_single_example(sent, label_list, max_seq_length,
class BERTModel (line 74) | class BERTModel:
method __init__ (line 76) | def __init__(self, task, pb_model, tagDir, threshold, vocab_file):
method convert (line 122) | def convert(self, line):
method getAllResult (line 130) | def getAllResult(self, sentences):
method rematch (line 142) | def rematch(self, arrays):
method predict (line 153) | def predict(self, sentences):
method getProb (line 189) | def getProb(self, sentences):
method getProbs (line 208) | def getProbs(self, sentences):
Copy disabled (too large)
Download .json
Condensed preview — 47 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (19,154K chars).
[
{
"path": ".idea/Cail2019_track2.iml",
"chars": 398,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<module type=\"PYTHON_MODULE\" version=\"4\">\n <component name=\"NewModuleRootManager"
},
{
"path": ".idea/misc.xml",
"chars": 292,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n <component name=\"JavaScriptSettings\">\n <option name=\"l"
},
{
"path": ".idea/modules.xml",
"chars": 282,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n <component name=\"ProjectModuleManager\">\n <modules>\n "
},
{
"path": ".idea/workspace.xml",
"chars": 24020,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n <component name=\"ChangeListManager\">\n <list default=\"t"
},
{
"path": "README.md",
"chars": 3540,
"preview": "# cail2019_track2\n中国法研杯CAIL2019要素抽取任务第三名方案分享\n====\n欢迎大家使用[tensorflow1.x的bert系列模型库,支持单机多卡,梯度累积,自动导出pb部署](https://github.co"
},
{
"path": "bert/.gitignore",
"chars": 1361,
"preview": "# Initially taken from Github's Python gitignore file\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$"
},
{
"path": "bert/CONTRIBUTING.md",
"chars": 1323,
"preview": "# How to Contribute\n\nBERT needs to maintain permanent compatibility with the pre-trained model files,\nso we do not plan "
},
{
"path": "bert/LICENSE",
"chars": 11358,
"preview": "\n Apache License\n Version 2.0, January 2004\n "
},
{
"path": "bert/README.md",
"chars": 42755,
"preview": "# BERT\n\n**\\*\\*\\*\\*\\* New February 7th, 2019: TfHub Module \\*\\*\\*\\*\\***\n\nBERT has been uploaded to [TensorFlow Hub](https"
},
{
"path": "bert/__init__.py",
"chars": 616,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/create_pretraining_data.py",
"chars": 15247,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/extract_features.py",
"chars": 13918,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/modeling.py",
"chars": 37922,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/modeling_test.py",
"chars": 9201,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/multilingual.md",
"chars": 11297,
"preview": "## Models\n\nThere are two multilingual models currently available. We do not plan to release\nmore single-language models,"
},
{
"path": "bert/optimization.py",
"chars": 6258,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/optimization_test.py",
"chars": 1731,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/predicting_movie_reviews_with_bert_on_tf_hub.ipynb",
"chars": 66478,
"preview": "{\n \"nbformat\": 4,\n \"nbformat_minor\": 0,\n \"metadata\": {\n \"colab\": {\n \"name\": \"Predicting Movie Reviews with BE"
},
{
"path": "bert/requirements.txt",
"chars": 110,
"preview": "tensorflow >= 1.11.0 # CPU Version of TensorFlow.\n# tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow.\n"
},
{
"path": "bert/run_classifier.py",
"chars": 34813,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/run_classifier_with_tfhub.py",
"chars": 11456,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/run_pretraining.py",
"chars": 18687,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/run_squad.py",
"chars": 46562,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/sample_text.txt",
"chars": 4364,
"preview": "This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত\nText should be one-sentence-per-line, wi"
},
{
"path": "bert/tokenization.py",
"chars": 12257,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "bert/tokenization_test.py",
"chars": 4599,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "convert.py",
"chars": 627,
"preview": "#!/usr/bin/env python\n#-*- coding:utf-8 -*-\n# author:huanghui\n# datetime:2019/9/30 10:12\nfrom utils.ckpt2pb import conve"
},
{
"path": "createPretrainData.py",
"chars": 15831,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "data/divorce/data_small_selected.json",
"chars": 992248,
"preview": "[{\"labels\": [], \"sentence\": \"原告林某某诉称:我与被告经人介绍建立恋爱关系,于1995年在菏泽市民政局办理结婚登记手续。\"}, {\"labels\": [\"DV1\"], \"sentence\": \"1998年2月份生"
},
{
"path": "data/divorce/tags.txt",
"chars": 91,
"preview": "DV1\nDV2\nDV3\nDV4\nDV5\nDV6\nDV7\nDV8\nDV9\nDV10\nDV11\nDV12\nDV13\nDV14\nDV15\nDV16\nDV17\nDV18\nDV19\nDV20\n"
},
{
"path": "data/divorce/train_selected.json",
"chars": 2134035,
"preview": "[{\"labels\": [], \"sentence\": \"原告张某某诉称,原、被告于2012年初经人介绍相识,于2012年登记结婚,婚后感情一般。\"}, {\"labels\": [\"DV5\", \"DV10\", \"DV3\"], \"sentenc"
},
{
"path": "data/labor/data_small_selected.json",
"chars": 516732,
"preview": "[{\"sentence\": \"被告快手公司答辩称:案涉争议已经由劳动争议仲裁部门处理且作出了公正裁决,原告一而再再而三提起无理诉讼,明显浪费司法资源。\", \"labels\": []}, {\"sentence\": \"一、原告提起本案诉讼属于重"
},
{
"path": "data/labor/tags.txt",
"chars": 91,
"preview": "LB1\nLB2\nLB3\nLB4\nLB5\nLB6\nLB7\nLB8\nLB9\nLB10\nLB11\nLB12\nLB13\nLB14\nLB15\nLB16\nLB17\nLB18\nLB19\nLB20\n"
},
{
"path": "data/labor/train_selected.json",
"chars": 2378389,
"preview": "[{\"sentence\": \"原告向本院提出诉讼请求:1.依法解除原告与被告的劳动关系;\", \"labels\": [\"LB1\"]}, {\"sentence\": \"2.判令被告宁乡县公共汽车公司支付原告工资4500元;\", \"labels\":"
},
{
"path": "data/loan/data_small_selected.json",
"chars": 560503,
"preview": "[{\"sentence\": \"一、农信社向东方资产公司、东方资产公司向大连德泰控股有限公司、大连德泰控股有限公司向原告转让不良资产包中是否包括案涉债权本息。\", \"labels\": [\"LN1\"]}, {\"sentence\": \"二被上诉人"
},
{
"path": "data/loan/tags.txt",
"chars": 91,
"preview": "LN1\nLN2\nLN3\nLN4\nLN5\nLN6\nLN7\nLN8\nLN9\nLN10\nLN11\nLN12\nLN13\nLN14\nLN15\nLN16\nLN17\nLN18\nLN19\nLN20\n"
},
{
"path": "data/loan/train_selected.json",
"chars": 1953632,
"preview": "[{\"sentence\": \"瑞华公司向本院提出诉讼请求:\", \"labels\": []}, {\"sentence\": \"本院认为,工行永安支行分别与京朋公司、董雪琴、林永福签订的《国内保理业务合同》、《最高额抵押合同》、《最高额保证合同》"
},
{
"path": "evaluation.py",
"chars": 2223,
"preview": "#!/usr/bin/env python\n#-*- coding:utf-8 -*-\n# author:huanghui\n# datetime:2019/9/30 10:28\nimport json\nfrom utils.evaluate"
},
{
"path": "genPretrainData.py",
"chars": 1053,
"preview": "import json\n\ndef getSentences(filename):\n sentences = []\n with open(filename, \"r\", encoding=\"utf-8\") as f:\n "
},
{
"path": "run_pretrain.py",
"chars": 19178,
"preview": "# coding=utf-8\n# Copyright 2018 The Google AI Language Team Authors.\n#\n# Licensed under the Apache License, Version 2.0 "
},
{
"path": "search_threshold.py",
"chars": 3959,
"preview": "#!/usr/bin/env python\n#-*- coding:utf-8 -*-\n# author:huanghui\n# datetime:2019/9/30 10:55\n\nfrom utils.predict import BERT"
},
{
"path": "train.py",
"chars": 39141,
"preview": "#!/usr/bin/env python\n#-*- coding:utf-8 -*-\n# author:huanghui\n# datetime:2019/9/30 8:54\n\n# coding=utf-8\n# Copyright 2018"
},
{
"path": "utils/__init__.py",
"chars": 87,
"preview": "#!/usr/bin/env python\n#-*- coding:utf-8 -*-\n# author:huanghui\n# datetime:2019/9/30 8:51"
},
{
"path": "utils/ckpt2pb.py",
"chars": 3737,
"preview": "#!/usr/bin/env python\n#-*- coding:utf-8 -*-\n# author:huanghui\n# datetime:2019/9/30 9:58\n\nfrom bert import modeling\nimpor"
},
{
"path": "utils/evaluate.py",
"chars": 3299,
"preview": "#!/usr/bin/env python\n#-*- coding:utf-8 -*-\n# author:huanghui\n# datetime:2019/9/30 10:28\n\nclass Judger:\n # Initialize"
},
{
"path": "utils/models.py",
"chars": 4449,
"preview": "import tensorflow as tf\n\nclass RCNN():\n def __init__(self, embedding, context_dim, hidden_dim, dropout_keep_prob):\n "
},
{
"path": "utils/predict.py",
"chars": 7409,
"preview": "#!/usr/bin/env python\n#-*- coding:utf-8 -*-\n# author:huanghui\n# datetime:2019/9/30 9:42\n\nimport tensorflow as tf\nfrom be"
}
]
About this extraction
This page contains the full source code of the huanghuidmml/cail2019_track2 GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 47 files (8.6 MB), approximately 2.3M tokens, and a symbol index with 271 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.