Repository: XuhaoWan/DMCP Branch: main Commit: 2572a3b252c2 Files: 24 Total size: 78.3 KB Directory structure: gitextract_ekw9y8z4/ ├── .gitignore ├── DMCP.py ├── LICENSE ├── MANIFEST.in ├── Models/ │ ├── ENR.py │ ├── ETR.py │ ├── FNN.py │ ├── GBR.py │ ├── GPR.py │ ├── KNR.py │ ├── KRR.py │ ├── Lasso.py │ ├── MLP.py │ ├── RFR.py │ └── SVR.py ├── README.md ├── Visualization/ │ ├── Violin.py │ ├── bar.py │ ├── pearson.py │ ├── pie.py │ └── scatter.py ├── manual ├── requirements.txt └── setup.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # https://github.com/github/gitignore/blob/main/Python.gitignore # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ ================================================ FILE: DMCP.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. 
from Models.RFR import RFR from Models.KRR import KRR from Models.GBR import GBR from Models.KNR import KNR from Models.FNN import FNN from Models.SVR import SVR from Models.Lasso import LSO from Models.ENR import ENR from Models.GPR import GPR from Models.ETR import ETR from Models.MLP import MLP from Visualization.Violin import plot_Violin from Visualization.bar import plot_bar from Visualization.scatter import plot_scatter from Visualization.pearson import plot_pearson from Visualization.pie import plot_pie import os import numpy as np from sklearn.model_selection import train_test_split, cross_val_score from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer import f90nml from multidict import CIMultiDict from statistics import mean input_file = f90nml.read('DMCP_input_file') data = CIMultiDict(input_file['data']) general = CIMultiDict(input_file['general']) visualization = CIMultiDict(input_file['visualization']) def parse_data(): # intrn if 'intrn' not in data.keys(): print('No train data file') else: data_file = data['intrn'] data_train = np.loadtxt(data_file, delimiter=",", dtype="float") # grept if 'grept' not in general.keys(): iteration = 1 else: iteration = general['grept'] train_set_RMSE = {} train_set_R2 = {} test_set_RMSE = {} test_set_R2 = {} estimator_dict = {} for i in range(iteration): x = preprocessing_data(data_train) x = add_noise(x) y = data_train[..., -1] ##psplt if 'psplt' not in general.keys(): test_size = 0.2 else: test_size = 1 - general['psplt'] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=16) # model_load # gmodl if 'gmodl' not in general: print("Not choose model") else: model_list = general['gmodl'] if type(model_list) is str: model_list = [model_list] for model in model_list: param = get_params(model) model_obj = eval(model) ML_model = model_obj() if 'modpr' in general: if general['modpr'] == 'ON': ML_model.auto_tune_params(x_train, y_train) 
ML_model.modify_params(param) ML_model.build_model() # gcrva if 'gcrva' in general.keys(): if general['gcrva'] == 'ON': train_rmse, train_r2, test_rmse, test_r2, estimator = ML_model.model_evaluate(x, y, general[ 'gcvrn']) if i == 0: train_set_RMSE[model] = [train_rmse] train_set_R2[model] = [train_r2] test_set_RMSE[model] = [test_rmse] test_set_R2[model] = [test_r2] estimator_dict[model] = [estimator] else: train_set_RMSE[model].append(train_rmse) train_set_R2[model].append(train_r2) test_set_RMSE[model].append(test_rmse) test_set_R2[model].append(test_r2) estimator_dict[model].append(estimator) # ML_model.calculate(x_train, x_test, y_train, y_test) result_visualize(train_set_RMSE, train_set_R2, test_set_RMSE, test_set_R2) optimal_model, optimal_model_name = choose_optimal_model(train_set_RMSE, estimator_dict) predict(optimal_model, optimal_model_name) def predict(optimal_model, optimal_model_name): if 'intrn' not in data.keys(): print('No train data file') else: data_file = data['intrn'] data_train = np.loadtxt(data_file, delimiter=",", dtype="float") x = preprocessing_data(data_train) x = add_noise(x) y = data_train[..., -1] ##psplt if 'psplt' not in general.keys(): test_size = 0.2 else: test_size = 1 - general['psplt'] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=16) y_train_true, y_train_pred = y_train, optimal_model.predict(x_train) y_test_true, y_test_pred = y_test, optimal_model.predict(x_test) plot_scatter(y_train_true, y_train_pred, y_test_true, y_test_pred) plot_pearson(optimal_model_name, x_train) if optimal_model_name in ['GBR', 'RFR', 'ETR']: plot_pie(optimal_model_name, optimal_model.feature_importances_) def choose_optimal_model(model_evaluate_data, estimator_dict): if 'fmodl' in visualization: optimal_model_name = visualization['fmodl'] else: model_list = [] data_list = [] for key in model_evaluate_data.keys(): model_list.append(key) data_list.append(mean(model_evaluate_data[key])) optimal_model_name = 
model_list[data_list.index(min(data_list))] optmal_index = model_evaluate_data[optimal_model_name].index(min(model_evaluate_data[optimal_model_name])) optimal_model = estimator_dict[optimal_model_name][optmal_index][0] return optimal_model, optimal_model_name # def parse_DMCP_input_file(): # input_file = f90nml.read('DMCP_input_file') # return input_file def result_visualize(train_set_RMSE, train_set_R2, test_set_RMSE, test_set_R2): if 'vvoln' in visualization.keys(): if visualization['vvoln'] == 'ON': plot_Violin('RMSE', test_set_RMSE) plot_Violin('R2', test_set_R2) if 'vcomp' in visualization.keys(): if visualization['vcomp'] == 'ON': plot_bar('DMCP', train_set_RMSE, train_set_R2, test_set_RMSE, test_set_R2) def preprocessing_data(data_train): # pscal if 'pscal' not in general.keys(): X = data_train[..., 0:(data_train.shape[1] - 1)] else: if general['pscal'] == 'OFF': X = data_train[..., 0:(data_train.shape[1] - 1)] elif general['pscal'] == 'NOR': scaler = MinMaxScaler() X = scaler.fit_transform(data_train[..., 0:(data_train.shape[1] - 1)]) elif general['pscal'] == 'STA': scaler = StandardScaler() X = scaler.fit_transform(data_train[..., 0:(data_train.shape[1] - 1)]) else: scaler = Normalizer(norm='l2') X = scaler.fit_transform(data_train[..., 0:(data_train.shape[1] - 1)]) return X def add_noise(X): # pnose if 'pnose' not in general.keys(): X = X else: scale = general['pnose'] x_noise = np.random.normal(loc=0.0, scale=scale, size=X.shape) X = X + x_noise return X def get_params(model): if ('PR' + model) not in general.keys(): param = {} print("Not set" + 'PR' + model) else: param = general['PR' + model] param_key = param[1:(len(param) - 1):3] param_val = param[3:(len(param) - 1):3] param = dict(zip(param_key, param_val)) return param def main(): parse_data() if __name__ == "__main__": main() ================================================ FILE: LICENSE ================================================ GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 
1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. 
Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. 
You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. 
If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. 
(This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. 
Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. 
Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. 
If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. ================================================ FILE: MANIFEST.in ================================================ include requirements.txt ================================================ FILE: Models/ENR.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. import numpy as np from sklearn.linear_model import ElasticNet as enr from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV from sklearn.preprocessing import MinMaxScaler class ENR(object): def __init__(self): self.params_defualt = {'alpha': 1.0, 'fit_intercept': True, 'l1_ratio': 0.5, 'normalize': False, 'precompute': False, 'max_iter': 1000, 'tol': 1e-4, 'warm_start': False, 'positive': False, 'selection': 'cyclic', 'random_state': 2} self.tuned_parameters = {'alpha': [1.0] , 'fit_intercept': [True], 'l1_ratio':[0.5], 'normalize': [False], 'precompute': [False], 'max_iter': [1000], 'tol': [1e-4], 'warm_start': [False], 'positive': [False], 'selection': ['cyclic'], 'random_state': [2]} def auto_tune_params(self, x_train, y_train): #use RMSE as the scoring clf = GridSearchCV( enr(), self.tuned_parameters, scoring='neg_root_mean_squared_error' ) clf.fit(x_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) self.params_defualt = clf.best_params_ def modify_params(self, params): for key in 
params: self.params_defualt[key] = params[key] def build_model(self): self.model = enr(**self.params_defualt) def model_evaluate(self, x, y, cv): scoring = ['neg_root_mean_squared_error', 'r2'] scores = cross_validate(self.model, x, y, scoring=scoring, cv=cv, return_train_score=True, return_estimator=True) self.estimator = scores['estimator'] return -scores['train_neg_root_mean_squared_error'].mean(), scores['train_r2'].mean(), -scores[ 'test_neg_root_mean_squared_error'].mean(), scores['test_r2'].mean(), scores['estimator'] def calculate(self, x_train, x_test, y_train, y_test): self.model.fit(x_train, y_train) rmse = np.sqrt(mse(y_train, self.model.predict(x_train))) r2 = r2_score(y_train, self.model.predict(x_train)) rmset = np.sqrt(mse(y_test, self.model.predict(x_test))) r2t = r2_score(y_test, self.model.predict(x_test)) print('pre:', self.model.predict(x_test)) print(y_test) print(rmse) print(r2) print(rmset) print(r2t) return r2 ================================================ FILE: Models/ETR.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. 
import numpy as np from sklearn.tree import ExtraTreeRegressor as etr from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV from sklearn.preprocessing import MinMaxScaler class ETR(object): def __init__(self): self.params_defualt = {'criterion': 'mse', 'splitter': 'random', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'auto', 'random_state': 16, 'min_impurity_decrease': 0, 'max_leaf_nodes': None, 'ccp_alpha': 0.0} self.tuned_parameters = {'criterion': ['mse'], 'splitter': ['random'], 'max_depth': [None], 'min_samples_split': [2], 'min_samples_leaf': [1], 'min_weight_fraction_leaf': [0.0], 'max_features': ['auto'], 'random_state': [16], 'min_impurity_decrease': [0], 'max_leaf_nodes': [None], 'ccp_alpha': [0.0]} def auto_tune_params(self, x_train, y_train): # use RMSE as the scoring clf = GridSearchCV( etr(), self.tuned_parameters, scoring='neg_root_mean_squared_error' ) clf.fit(x_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) self.params_defualt = clf.best_params_ def modify_params(self, params): for key in params: self.params_defualt[key] = params[key] def build_model(self): self.model = etr(**self.params_defualt) def model_evaluate(self, x, y, cv): scoring = ['neg_root_mean_squared_error', 'r2'] scores = cross_validate(self.model, x, y, scoring=scoring, cv=cv, return_train_score=True, return_estimator=True) self.estimator = scores['estimator'] return -scores['train_neg_root_mean_squared_error'].mean(), scores['train_r2'].mean(), -scores[ 'test_neg_root_mean_squared_error'].mean(), scores['test_r2'].mean(), scores['estimator'] def calculate(self, x_train, x_test, y_train, y_test): self.model.fit(x_train, y_train) rmse = np.sqrt(mse(y_train, self.model.predict(x_train))) r2 = r2_score(y_train, self.model.predict(x_train)) 
rmset = np.sqrt(mse(y_test, self.model.predict(x_test))) r2t = r2_score(y_test, self.model.predict(x_test)) print('pre:', self.model.predict(x_test)) print(y_test) print(rmse) print(r2) print(rmset) print(r2t) print(self.model.feature_importances_) return r2 ================================================ FILE: Models/FNN.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. import torch from torch import nn import torch.nn.functional as F import torch.utils.data as Data from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler import numpy as np from torchvision import datasets, transforms from torch.nn import init from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score class FNN(object): def __init__(self, x_train, x_test, y_train, y_test, params, model_evaluation, x, y): self.x = x self.y = y self.x_train = torch.from_numpy(x_train).type(torch.FloatTensor) y_train = torch.from_numpy(y_train).type(torch.FloatTensor) self.x_test = torch.from_numpy(x_test).type(torch.FloatTensor) y_test = torch.from_numpy(y_test).type(torch.FloatTensor) self.y_test = torch.unsqueeze(y_test, 1) self.y_train = torch.unsqueeze(y_train, 1) self.params_defualt = {'BATCH_SIZE': 32, 'LR': 0.05, 'EPOCH': 50} self.params_modify = params self.modify_params() torch_data = Data.TensorDataset(self.x_train, self.y_train) self.loader = Data.DataLoader(dataset=torch_data, batch_size=self.params_defualt['BATCH_SIZE'], shuffle=True) self.calculate() def modify_params(self): for key in self.params_modify: self.params_defualt[key] = self.params_modify[key] def calculate(self): adam_net = Net() 
opt_adam = torch.optim.Adam(adam_net.parameters(), lr=self.params_defualt['LR']) loss_func = nn.MSELoss() all_loss = {} for epoch in range(self.params_defualt['EPOCH']): print('epoch', epoch) for step, (b_x, b_y) in enumerate(self.loader): print('step', step) pre = adam_net(b_x) loss = loss_func(pre, b_y) opt_adam.zero_grad() loss.backward() opt_adam.step() all_loss[epoch + 1] = loss print(all_loss) yt = self.y_train.numpy() yp = adam_net(self.x_train) yp = yp.detach().numpy() rmse = np.sqrt(mse(yt, yp)) r2 = r2_score(yt, yp) yt1 = self.y_test.numpy() yp1 = adam_net(self.x_test) yp1 = yp1.detach().numpy() rmset = np.sqrt(mse(yt1, yp1)) r2t = r2_score(yt1, yp1) print(rmse) print(r2) print(rmset) print(r2t) class Net(nn.Module): def __init__(self): super(Net, self).__init__() self.hidden = nn.Linear(20, 32) self.predict = nn.Linear(32, 1) def forward(self, x): x = F.relu(self.hidden(x)) x = self.predict(x) return x def weights_init(m): if isinstance(m, nn.Linear): init.kaiming_normal(m.weight.data) ================================================ FILE: Models/GBR.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. 
import numpy as np from sklearn.ensemble import GradientBoostingRegressor as gbr from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV from sklearn.preprocessing import MinMaxScaler class GBR(object): def __init__(self): self.params_defualt = {'n_estimators': 500, 'max_depth': 5, 'min_samples_split': 5, 'learning_rate': 0.005, 'loss': 'huber'} self.tuned_parameters ={'n_estimators': [500], 'max_depth': [5], 'min_samples_split': [5], 'learning_rate': [0.005, 0.01], 'loss': ['huber']} def auto_tune_params(self, x_train, y_train): #use RMSE as the scoring clf = GridSearchCV( gbr(), self.tuned_parameters, scoring='neg_root_mean_squared_error' ) clf.fit(x_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) self.params_defualt = clf.best_params_ def modify_params(self, params): for key in params: self.params_defualt[key] = params[key] def build_model(self): self.model = gbr(**self.params_defualt) def model_evaluate(self, x, y, cv): scoring = ['neg_root_mean_squared_error', 'r2'] scores = cross_validate(self.model, x, y, scoring=scoring, cv=cv, return_train_score=True, return_estimator=True) self.estimator = scores['estimator'] return -scores['train_neg_root_mean_squared_error'].mean(), scores['train_r2'].mean(),-scores['test_neg_root_mean_squared_error'].mean(), scores['test_r2'].mean(), scores['estimator'] #scores1 = cross_val_score(self.model, x, y, cv=cv, scoring='neg_root_mean_squared_error') #return scores1,scores1 def calculate(self, x_train, x_test, y_train, y_test): return self.model.feature_importances_ ================================================ FILE: Models/GPR.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. 
Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. import numpy as np from sklearn.gaussian_process import GaussianProcessRegressor as gpr from sklearn.gaussian_process.kernels import RBF, ConstantKernel from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV from sklearn.preprocessing import MinMaxScaler class GPR(object): def __init__(self): self.params_defualt = {} self.tuned_parameters = {} def auto_tune_params(self, x_train, y_train): # use RMSE as the scoring clf = GridSearchCV( gpr(), self.tuned_parameters, scoring='neg_root_mean_squared_error' ) clf.fit(x_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) self.params_defualt = clf.best_params_ def modify_params(self, params): for key in params: self.params_defualt[key] = params[key] def build_model(self): self.model = gpr(**self.params_defualt) def model_evaluate(self, x, y, cv): scoring = ['neg_root_mean_squared_error', 'r2'] scores = cross_validate(self.model, x, y, scoring=scoring, cv=cv, return_train_score=True, return_estimator=True) self.estimator = scores['estimator'] return -scores['train_neg_root_mean_squared_error'].mean(), scores['train_r2'].mean(), -scores[ 'test_neg_root_mean_squared_error'].mean(), scores['test_r2'].mean(), scores['estimator'] def calculate(self, x_train, x_test, y_train, y_test): self.model.fit(x_train, y_train) rmse = np.sqrt(mse(y_train, self.model.predict(x_train))) r2 = r2_score(y_train, self.model.predict(x_train)) rmset = np.sqrt(mse(y_test, self.model.predict(x_test))) r2t = r2_score(y_test, self.model.predict(x_test)) print('pre:', self.model.predict(x_test)) print(y_test) print(rmse) print(r2) print(rmset) print(r2t) return r2 ================================================ 
FILE: Models/KNR.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. import numpy as np from sklearn.neighbors import KNeighborsRegressor as knr from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV from sklearn.preprocessing import MinMaxScaler class KNR(object): def __init__(self): self.params_defualt = {'n_neighbors': 4} self.tuned_parameters = {'n_neighbors': [4,5]} def auto_tune_params(self, x_train, y_train): # use RMSE as the scoring clf = GridSearchCV( knr(), self.tuned_parameters, scoring='neg_root_mean_squared_error' ) clf.fit(x_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) self.params_defualt = clf.best_params_ def modify_params(self, params): for key in params: self.params_defualt[key] = params[key] def build_model(self): self.model = knr(**self.params_defualt) def model_evaluate(self, x, y, cv): scoring = ['neg_root_mean_squared_error', 'r2'] scores = cross_validate(self.model, x, y, scoring=scoring, cv=cv, return_train_score=True, return_estimator=True) self.estimator = scores['estimator'] return -scores['train_neg_root_mean_squared_error'].mean(), scores['train_r2'].mean(), -scores[ 'test_neg_root_mean_squared_error'].mean(), scores['test_r2'].mean(), scores['estimator'] def calculate(self, x_train, x_test, y_train, y_test): self.model.fit(x_train, y_train) rmse = np.sqrt(mse(y_train, self.model.predict(x_train))) r2 = r2_score(y_train, self.model.predict(x_train)) rmset = np.sqrt(mse(y_test, self.model.predict(x_test))) r2t = r2_score(y_test, 
self.model.predict(x_test)) print('pre:', self.model.predict(x_test)) print(y_test) print(rmse) print(r2) print(rmset) print(r2t) return r2 ================================================ FILE: Models/KRR.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. import numpy as np from sklearn.kernel_ridge import KernelRidge as krr from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV from sklearn.preprocessing import MinMaxScaler class KRR(object): def __init__(self): self.params_defualt = {'alpha': 1, 'kernel': 'linear', 'gamma': 0, 'degree': 3, 'coef0': '1'} self.tuned_parameters = {'alpha': [1,2], 'kernel': ['linear'], 'gamma': [0,0.1], 'degree': [3,4], 'coef0': ['1']} def auto_tune_params(self, x_train, y_train): #use RMSE as the scoring clf = GridSearchCV( krr(), self.tuned_parameters, scoring='neg_root_mean_squared_error' ) clf.fit(x_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) self.params_defualt = clf.best_params_ def modify_params(self, params): for key in params: self.params_defualt[key] = params[key] def build_model(self): self.model = krr(**self.params_defualt) def model_evaluate(self, x, y, cv): scoring = ['neg_root_mean_squared_error', 'r2'] scores = cross_validate(self.model, x, y, scoring=scoring, cv=cv, return_train_score=True, return_estimator=True) self.estimator = scores['estimator'] return -scores['train_neg_root_mean_squared_error'].mean(), scores['train_r2'].mean(), -scores[ 'test_neg_root_mean_squared_error'].mean(), scores['test_r2'].mean(), 
scores['estimator'] def calculate(self, x_train, x_test, y_train, y_test): self.model.fit(x_train, y_train) rmse = np.sqrt(mse(y_train, self.model.predict(x_train))) r2 = r2_score(y_train, self.model.predict(x_train)) rmset = np.sqrt(mse(y_test, self.model.predict(x_test))) r2t = r2_score(y_test, self.model.predict(x_test)) print('pre:', self.model.predict(x_test)) print(y_test) print(rmse) print(r2) print(rmset) print(r2t) return r2 ================================================ FILE: Models/Lasso.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. import numpy as np from sklearn.linear_model import Lasso as lso from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV from sklearn.preprocessing import MinMaxScaler class LSO(object): def __init__(self): self.params_defualt = {'alpha': 1.0, 'fit_intercept': True, 'normalize': False, 'precompute': False, 'max_iter': 1000, 'tol': 1e-4, 'warm_start': False, 'positive': False, 'selection': 'cyclic', 'random_state': 8} self.tuned_parameters = {'alpha': [1.0], 'fit_intercept': [True], 'normalize': [False], 'precompute': [False], 'max_iter': [1000], 'tol': [1e-4], 'warm_start': [False], 'positive': [False], 'selection': ['cyclic'], 'random_state': [8]} def auto_tune_params(self, x_train, y_train): # use RMSE as the scoring clf = GridSearchCV( lso(), self.tuned_parameters, scoring='neg_root_mean_squared_error' ) clf.fit(x_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) self.params_defualt = clf.best_params_ def 
modify_params(self, params): for key in params: self.params_defualt[key] = params[key] def build_model(self): self.model = lso(**self.params_defualt) def model_evaluate(self, x, y, cv): scoring = ['neg_root_mean_squared_error', 'r2'] scores = cross_validate(self.model, x, y, scoring=scoring, cv=cv, return_train_score=True, return_estimator=True) self.estimator = scores['estimator'] return -scores['train_neg_root_mean_squared_error'].mean(), scores['train_r2'].mean(), -scores[ 'test_neg_root_mean_squared_error'].mean(), scores['test_r2'].mean(), scores['estimator'] def calculate(self, x_train, x_test, y_train, y_test): self.model.fit(x_train, y_train) rmse = np.sqrt(mse(y_train, self.model.predict(x_train))) r2 = r2_score(y_train, self.model.predict(x_train)) rmset = np.sqrt(mse(y_test, self.model.predict(x_test))) r2t = r2_score(y_test, self.model.predict(x_test)) print('pre:', self.model.predict(x_test)) print(y_test) print(rmse) print(r2) print(rmset) print(r2t) return r2 ================================================ FILE: Models/MLP.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. 
import numpy as np from sklearn.neural_network import MLPRegressor as mlp from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split, cross_validate,GridSearchCV from sklearn.preprocessing import MinMaxScaler class MLP(object): def __init__(self): self.params_defualt = {'learning_rate': 'constant', 'learning_rate_init': 0.001, 'batch_size': 4, 'hidden_layer_sizes': (20, 32), 'random_state': 1, 'max_iter': 100000, 'activation': 'logistic'} self.tuned_parameters = {'learning_rate':['constant'], 'learning_rate_init':[0.001], 'batch_size':[4], 'hidden_layer_sizes':[(20, 32)], 'random_state':[1], 'max_iter':[100000], 'activation':['logistic']} def auto_tune_params(self, x_train, y_train): #use RMSE as the scoring clf = GridSearchCV( mlp(), self.tuned_parameters, scoring='neg_root_mean_squared_error' ) clf.fit(x_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) self.params_defualt = clf.best_params_ def modify_params(self, params): for key in params: self.params_defualt[key] = params[key] def build_model(self): self.model = mlp(**self.params_defualt) def model_evaluate(self, x, y, cv): scoring = ['neg_root_mean_squared_error', 'r2'] scores = cross_validate(self.model, x, y, scoring=scoring, cv=cv, return_train_score=True, return_estimator=True) self.estimator = scores['estimator'] return -scores['train_neg_root_mean_squared_error'].mean(), scores['train_r2'].mean(), -scores[ 'test_neg_root_mean_squared_error'].mean(), scores['test_r2'].mean(), scores['estimator'] def calculate(self, x_train, x_test, y_train, y_test): self.model.fit(x_train, y_train) rmse = np.sqrt(mse(y_train, self.model.predict(x_train))) r2 = r2_score(y_train, self.model.predict(x_train)) rmset = np.sqrt(mse(y_test, self.model.predict(x_test))) r2t = r2_score(y_test, self.model.predict(x_test)) print('pre:', self.model.predict(x_test)) print(y_test) print(rmse) 
print(r2) print(rmset) print(r2t) return r2 ================================================ FILE: Models/RFR.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. import numpy as np from sklearn.ensemble import RandomForestRegressor as rfr from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV from sklearn.preprocessing import MinMaxScaler class RFR(object): def __init__(self): self.params_defualt = {'n_estimators': 500, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_leaf_nodes': None, 'warm_start': False, 'verbose': 0, 'ccp_alpha': 0.0, 'max_samples': None} self.tuned_parameters = {'n_estimators': [500], #'criterion': [mse], 'max_depth': [None], 'min_samples_split': [2], 'min_samples_leaf': [1], 'max_features' : ['auto'], 'max_leaf_nodes': [None], 'warm_start': [False], 'verbose': [0], 'ccp_alpha': [0.0], 'max_samples': [None]} def auto_tune_params(self, x_train, y_train): # use RMSE as the scoring clf = GridSearchCV( rfr(), self.tuned_parameters, scoring='neg_root_mean_squared_error' ) clf.fit(x_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) self.params_defualt = clf.best_params_ def modify_params(self, params): for key in params: self.params_defualt[key] = params[key] def build_model(self): self.model = rfr(**self.params_defualt) def model_evaluate(self, x, y, cv): scoring = ['neg_root_mean_squared_error', 'r2'] scores = cross_validate(self.model, x, y, scoring=scoring, cv=cv, return_train_score=True, 
return_estimator=True) self.estimator = scores['estimator'] return -scores['train_neg_root_mean_squared_error'].mean(), scores['train_r2'].mean(), -scores[ 'test_neg_root_mean_squared_error'].mean(), scores['test_r2'].mean(), scores['estimator'] def calculate(self, x_train, x_test, y_train, y_test): self.model.fit(x_train, y_train) rmse = np.sqrt(mse(y_train, self.model.predict(x_train))) r2 = r2_score(y_train, self.model.predict(x_train)) rmset = np.sqrt(mse(y_test, self.model.predict(x_test))) r2t = r2_score(y_test, self.model.predict(x_test)) print('pre:', self.model.predict(x_test)) print(y_test) print(rmse) print(r2) print(rmset) print(r2t) print(self.model.feature_importances_) return r2 ================================================ FILE: Models/SVR.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. 
import numpy as np from sklearn.svm import SVR as svr from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV from sklearn.preprocessing import MinMaxScaler class SVR(object): def __init__(self): self.params_defualt = {} self.tuned_parameters = {} def auto_tune_params(self, x_train, y_train): #use RMSE as the scoring clf = GridSearchCV( svr(), self.tuned_parameters, scoring='neg_root_mean_squared_error' ) clf.fit(x_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) self.params_defualt = clf.best_params_ def modify_params(self, params): for key in params: self.params_defualt[key] = params[key] def build_model(self): self.model = svr(**self.params_defualt) def model_evaluate(self, x, y, cv): scoring = ['neg_root_mean_squared_error', 'r2'] scores = cross_validate(self.model, x, y, scoring=scoring, cv=cv, return_train_score=True, return_estimator=True) self.estimator = scores['estimator'] return -scores['train_neg_root_mean_squared_error'].mean(), scores['train_r2'].mean(), -scores[ 'test_neg_root_mean_squared_error'].mean(), scores['test_r2'].mean(), scores['estimator'] def calculate(self, x_train, x_test, y_train, y_test): self.model.fit(x_train, y_train) rmse = np.sqrt(mse(y_train, self.model.predict(x_train))) r2 = r2_score(y_train, self.model.predict(x_train)) rmset = np.sqrt(mse(y_test, self.model.predict(x_test))) r2t = r2_score(y_test, self.model.predict(x_test)) print('pre:', self.model.predict(x_test)) print(y_test) print(rmse) print(r2) print(rmset) print(r2t) return r2 ================================================ FILE: README.md ================================================ # DMCP:DFT-based Machine learning method for Captureing Property Relationship with Structures DMCP is aimed to implement DFT-based and Machine-learning-accelerated (DFT-ML) scheme for captureing QSPR in intricate 
system. It is possible to predict the property of intricate systems such as HEAs and to reveal the intrinsic descriptors which determine the underlying property of them with appropriate algorithms and training data features. # Developer: DMCP is developed within Prof. Yuzheng Guo's group in Wuhan University, in collaboration with Prof. John Robertson's group in Cambridge University. Core developer: Xuhao Wan, Yuzheng Guo Email: xhwanrm@whu.edu.cn, yguo@whu.edu.cn # Major Features 1. Ten machine learning algorithms: GBR, KNR, SVR, GPR, FNN, RFR, ETR, KRR, LASSO, and ENR. 2. Multiple methods to improve model accuracy: dataset split, cross validation, repeated trials. 3. Visualization module for research. # Prerequisites 1. Generally, you need some data obtained from DFT calculations such as VASP, QE, and CP2K or an available material database. 2. DMCP requires Python 3 with the packages specified in requirements.txt. This is taken care of by pip. # Citation If you use DMCP in your research, please cite the following paper: 1. X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. doi.org/10.1016/j.matre.2021.100046. # Reference The works applying DMCP are listed as follows: 1. Dou B, Zhu Z, Merkurjev E, et al. Machine learning methods for small data challenges in molecular science. Chemical Reviews, 2023, 123(13): 8736-8780. 2. Tamtaji M, Gao H, Hossain M D, et al. Machine learning for design principles for single atom catalysts towards electrochemical reactions. Journal of Materials Chemistry A, 2022, 10(29): 15309-15331. 3. Liu X, Zhang Y, Wang W, et al. Transition metal and N doping on AlP monolayers for bifunctional oxygen electrocatalysts: density functional theory study assisted by machine learning description. ACS Applied Materials & Interfaces, 2021, 14(1): 1249-1259. 4. Huang Y, Rehman F, Tamtaji M, et al.
Mechanistic understanding and design of non-noble metal-based single-atom catalysts supported on two-dimensional materials for CO2 electroreduction. Journal of Materials Chemistry A, 2022, 10(11): 5813-5834. 5. Liu T, Zhao X, Liu X, et al. Understanding the hydrogen evolution reaction activity of doped single-atom catalysts on two-dimensional GaPS4 by DFT and machine learning. Journal of Energy Chemistry, 2023, 81: 93-100. 6. X. Wan, Z. Zhang*, H. Niu, Y. Yin, C. Kuai, J. Wang, C. Shao, Y. Guo*, Machine-Learning-Accelerated Catalytic Activity Predictions of Transition Metal Phthalocyanine Dual-Metal-Sites Catalysts for CO2 Reduction. The Journal of Physical Chemistry Letters, 2021. 7. H. Niu#, X. Wan#, X. Wang, C. Chen, J. Robertson, Z. Zhang*, Y. Guo*, Single-Atom Rhodium on Defective g-C3N4: A Promising Bifunctional Oxygen Electrocatalyst. ACS Sustainable Chem. Eng., 9(9), 3590-3599, 2021. 8. Wan X, Yu W, Niu H, et al. Revealing the Oxygen Reduction/Evolution Reaction Activity Origin of Carbon-Nitride-Related Single-Atom Catalysts: Quantum Chemistry in Artificial Intelligence. Chemical Engineering Journal. 2022, 440: 135946. 16. Khrabrov K, Shenbin I, Ryabov A, et al. nablaDFT: Large-Scale Conformational Energy and Hamiltonian Prediction benchmark and dataset. Physical Chemistry Chemical Physics, 2022, 24(42): 25853-25863. 17. Pant D, Pokharel S, Mandal S, et al. DFT-aided machine learning-based discovery of magnetism in Fe-based bimetallic chalcogenides. Scientific Reports, 2023, 13(1): 3277. # Tips Welcome to join the DMCP exchange WeChat group. 欢迎加入DMCP微信交流群。 ![9eb7685d4afb615aaf0f70843d8895c](https://user-images.githubusercontent.com/73831094/146893430-46b61a00-f54d-423f-98b6-a8413216c8d5.jpg) If it is invalid, please contact us by email.
================================================ FILE: Visualization/Violin.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt class plot_Violin(object): def __init__(self, title, data_dict): model_list = [] data_list = [] #for key in data_dict.keys(): #model_list.append(key) #data_list.append(data_dict[key]) tips = pd.DataFrame.from_dict(data_dict) #tips = np.array(data_list).T sns.set(style='ticks', font='Times New Roman', font_scale=1.8) fig = plt.figure(figsize=(9, 6)) ax=sns.violinplot(data=tips, split=True, linewidth = 2, #线宽 width = 0.8, #箱之间的间隔比例 palette = 'pastel', #设置调色板 #order = model_list, #筛选类别 # scale = 'count', #测度小提琴图的宽度: area-面积相同,count-按照样本数量决定宽度,width-宽度一样 gridsize = 50, #设置小提琴图的平滑度,越高越平滑 # inner = 'box', #设置内部显示类型 --> 'box','quartile','point','stick',None #bw = 0.8 #控制拟合程度,一般可以不设置 ) ax.set_ylabel(title + 'Score', fontsize=28, fontfamily='Times New Roman') ax.set_xlabel('model', fontsize=28, fontfamily='Times New Roman') #ax = fig.add_subplot(111) fig.savefig('volin_' + title + '.jpg', bbox_inches='tight') plt.show() #data_dict = {'GBR': [-0.32744250093837, -0.3958306749566164], 'KNR': [-0.3522280594983168, -0.3593963196775159], 'SVR': [-0.34374401871141413, -0.34763844616449313], 'GPR': [-0.3802567562102421, -0.3864921408452695], 'MLP': [-0.3912175478421521, -0.39099797659443214], 'RFR': [-0.34453840142807285, -0.3616486858237452], 'ETR': [-0.41477900933292655, -0.49834959923665945], 'KRR': [-0.4050557476038392, -0.41178328305680595], 'LSO': [-0.3890709246795953, -0.3890709246795953], 'ENR': [-0.3890709246795953, 
-0.3890709246795953]} #data_show = plot_Violin(data_dict) ================================================ FILE: Visualization/bar.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. import pandas as pd import numpy as np import matplotlib.pyplot as plt from statistics import mean class plot_bar(object): def __init__(self, title, train_set_RMSE, train_set_R2, test_set_RMSE, test_set_R2): data_list2 = [] for item in [train_set_RMSE, train_set_R2, test_set_RMSE, test_set_R2]: data_list1 = [] model_list = [] for i in item.keys(): data_list1.append(mean(item[i])) model_list.append(i) data_list2.append(data_list1) font={'family':'Times New Roman', 'weight':'normal', 'size':20} ##取出 4个dict中的数据,注意dict中的数据存放是无序的 R2train = data_list2[1] # train set R2 score R2test = data_list2[3]# test set R2 score RMSEtrain = data_list2[0] # train set RMSE RMSEtest = data_list2[2] # test set RMSE #for item in [R2train, R2test, RMSEtrain, RMSEtest]: #for i in range(len(item)): #if item[i] < 0: #item[i] = 0.5 label = model_list bar_width = 0.4 bar_x = np.arange(len(label)) fig1 = plt.figure(figsize=(9, 6)) ax1 = fig1.add_subplot(111) #ax1.set_title('RMSE') bar1 = ax1.bar(x=bar_x - bar_width/2, # 设置不同的x起始位置 height= RMSEtrain, width=bar_width, color='royalblue') bar2 = ax1.bar(x=bar_x + bar_width/2, # 设置不同的x起始位置 height= RMSEtest, width=bar_width, color='darkorange' ) ax1.set_ylabel('RMSE /eV', fontsize=24, fontfamily='Times New Roman') ax1.set_xticks(range(len(label))) ax1.set_xticklabels(label, fontsize=20, fontfamily='Times New Roman') #ax1.set_yticklabels(np.around((np.arange(0, 0.4, 0.05)), decimals=2), fontsize=20, fontfamily='Times New Roman') 
ax1.legend((bar1, bar2), ('Train set', 'Test set'), prop=font) fig2 = plt.figure(figsize=(9, 6)) ax2 = fig2.add_subplot(111) #ax1.set_title('RMSE') bar1 = ax2.bar(x=bar_x - bar_width/2, # 设置不同的x起始位置 height= R2train, width=bar_width, color='royalblue') bar2 = ax2.bar(x=bar_x + bar_width/2, # 设置不同的x起始位置 height= R2test, width=bar_width, color='darkorange' ) ax2.set_ylabel('Score', fontsize=24, fontfamily='Times New Roman') ax2.set_xticks(range(len(label))) ax2.set_xticklabels(label, fontsize=20, fontfamily='Times New Roman') #ax2.set_yticklabels(np.around((np.arange(0, 1.0, 0.2)), decimals=2), fontsize=20, fontfamily='Times New Roman') ax2.legend((bar1, bar2), ('Train set', 'Test set'), prop=font) fig1.savefig('bar_RMSE.jpg') fig2.savefig('bar2_R2.jpg') plt.show() ================================================ FILE: Visualization/pearson.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. 
import numpy as np from sklearn.ensemble import GradientBoostingRegressor as GBR from sklearn.metrics import mean_squared_error as mse from sklearn.metrics import r2_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler from scipy.stats import pearsonr import seaborn as sns import matplotlib.pyplot as plt class plot_pearson(object): def __init__(self, optimal_model_name,corelation): pearson_r2 = [] for i in range(corelation.shape[1]): pearson_r1 = [] for j in range(corelation.shape[1]): r, _ = pearsonr(corelation[:][i], corelation[:][j]) pearson_r1.append(r) pearson_r2.append(pearson_r1) ax1 = sns.heatmap(pearson_r2, vmin=-1, vmax=1, cmap='RdBu') plt.title(optimal_model_name) plt.savefig('Pcdac_pearson.jpg') plt.show() ================================================ FILE: Visualization/pie.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. 
import numpy as np import matplotlib.pyplot as plt class plot_pie(object): def __init__(self, optimal_model_name, feature_importance): font={'family':'Times New Roman', 'weight':'normal', 'size': 18} cmap = plt.get_cmap("tab20") colors = cmap(np.arange(len(feature_importance))) labels = ['e_dp1', 'H_f.ox1', 'N_m1', 'χ_1', 'I_m1', 'r_1', 'N_d1', 'Q_1', 'ΔG_COOH*1', 'ΔG_Max1', 'e_dp2','H_f.ox2', 'N_m2', 'χ_2', 'I_m2', 'r_2', 'N_d2', 'Q_2', 'ΔG_COOH*2', 'ΔG_Max2'] #for i in range(1,len(feature_importance) + 1): #labels.append(str('x')+str(i)) fig = plt.figure(figsize=(9, 6)) ax = fig.add_subplot(111) wedges, text = ax.pie(feature_importance, colors=colors, shadow=True, startangle=90, textprops=font) ax.legend(wedges, labels, bbox_to_anchor=(1, 0, 0, 1), fontsize=8) plt.title(optimal_model_name) fig.savefig('Pcdac_fipie.jpg') plt.show() ================================================ FILE: Visualization/scatter.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- #author: xuhao wan, wei yu #If you use DMCP in your research, please cite the following paper:X. Wan, Z. Zhang*, W. Yu, Y. Guo*, A State-of-the-art Density-functional-theory-based and Machine-learning-accelerated Hybrid Method for Intricate System Catalysis. Materials Reports: Energy. 2021. 
import pandas as pd import numpy as np import matplotlib.pyplot as plt class plot_scatter(object): def __init__(self, y_train_true, y_train_pred, y_test_true, y_test_pred): font={'family':'Times New Roman', 'weight':'normal', 'size': 24} fig = plt.figure(figsize=(9, 6)) ax = fig.add_subplot(111) dot1 = ax.scatter(y_train_true, y_train_pred, s=80, c='white', edgecolors='royalblue', marker='o', linewidth=2) dot2 = ax.scatter(y_test_true, y_test_pred, s=80, c='white', edgecolors='darkorange', marker='s', linewidth=2) line = ax.plot([0,1,2.2], [0,1,2.2], color='k') ax.set_xlabel('$\mathregular{G_{DFT}}$ /eV', fontsize=24, fontfamily='Times New Roman') ax.set_ylabel('$\mathregular{G_{ML}}$ /eV' , fontsize=24, fontfamily='Times New Roman') ax.set_xlim(xmin=0, xmax=2.2) ax.set_ylim(ymin=0, ymax=2.2) ax.set_xticklabels(np.around((np.arange(0, 2.2, 0.25)), decimals=2), fontsize=20, fontfamily='Times New Roman') ax.set_yticklabels(np.around((np.arange(0, 2.2, 0.25)), decimals=2), fontsize=20, fontfamily='Times New Roman') ax.legend((dot1, dot2), ('Train set', 'Test set'), prop=font) fig.savefig('Pcdac_scatter.jpg', bbox_inches='tight') plt.show() ================================================ FILE: manual ================================================ The keywords INTRN and OUTDAT are the filename of the input and output data files, respectively. OTFIG is the filename prefix of the visualization results generated by DMCP and the format of these figures is optional, including jpg, png, and pdf. The keyword PSCAL controls the feature scaling: OFF, NOR, STA, and REG means no data scaling, normalization, standardization, and regularization processing, respectively. The noise processing is controlled by the keyword PNOSE, and its value determines the distribution range of noises while 0 means the noise processing is not employed. Original data is reproduced with randomly distributed noises in the scale of -x to x (x is the values of the keyword PNOSE). 
The keyword PSPLT controls the dataset split and its value is the percentage of the training dataset. The keyword GCRVA controls whether the cross-validation is employed (ON or OFF) while the value of the keyword GREPT is the number of repeated trials. When the cross-validation and repeated trials are applied together, the value of GREPT is the number of repetitions of the training procedure in each dataset split and the value of the keyword GCVRN is the number of the rounds of cross-validation. The selected algorithm is determined by the keyword GMODL: the corresponding GMODL values are GBR (for Gradient Boosted Regression), KNR (k-Neighbor Regression), SVR (Support Vector Regression), GPR (Gaussian Process Regression), FNN (Feedforward Neural Network), RFR (Random Forest Regression), ETR (Extra Trees Regression), KRR (Kernel Ridge Regression), LASSO (Least Absolute Shrinkage and Selection Operator Regression) and ENR (Elastic Net Regression). Several algorithms can be selected at the same time to establish different machine learning models by simply enumerating the corresponding values of GMODL. The model parameters can be provided by the keyword PRX where X represents the abbreviations (also the values of GMODL) of the algorithms. The keyword VVOLN in the visualization module controls the drawing of the violin plot. The keyword VCOMP is related to the histogram. The keywords that control the switch of the scatter plot and the pie chart are respectively VSCAM and VFTIM and their values are the selected machine learning model which is usually the best performing model. The keyword VPRAS controls whether the Pearson correlation map is drawn. To predict the catalytic performance, the corresponding feature values should be generated and transported into the model at first, which is controlled by the keyword INPRE and its value is the filename of the input data used for prediction.
And the keyword GPREM determines the model used in the prediction process which is usually the best performing model and it is also the switch of the prediction function in DMCP. ================================================ FILE: requirements.txt ================================================ #The DMCP Program can be run in either Linux or Windows operation systems utilizing command lines. #Please install the following compiler in your operation system to run DMCP. numpy>=1.20.1 scipy torch sklearn pandas matplotlib f90nml multidict statistics pytest>=4.6 ================================================ FILE: setup.py ================================================ import setuptools with open("README.md", "r") as fh: long_description = fh.read() with open("requirements.txt", "r") as fh: dependencies = fh.readlines() setuptools.setup( name="DMCP", packages=setuptools.find_packages(exclude=["tests"]), version="0.1.2", author="Xuhao Wan", author_email="xhwanrm@whu.edu.cn", description="DMCP", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/XuhaoWan/DMCP", python_requires=">=3.7", install_requires=dependencies, license="GNU", classifiers=[ "License :: OSI Approved :: GNU License", "Topic :: Scientific/Engineering :: Physics", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Development Status :: 4 - Beta", ], )