Repository: norbusan/scikit-learn Branch: master Commit: 248f6cf3156f Files: 1269 Total size: 14.3 MB Directory structure: gitextract_8esimy8a/ ├── .binder/ │ ├── postBuild │ └── requirements.txt ├── .circleci/ │ ├── artifact_path │ └── config.yml ├── .codecov.yml ├── .coveragerc ├── .git-blame-ignore-revs ├── .gitattributes ├── .github/ │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ ├── config.yml │ │ ├── doc_improvement.yml │ │ └── feature_request.yml │ ├── PULL_REQUEST_TEMPLATE.md │ ├── labeler-file-extensions.yml │ ├── labeler-module.yml │ ├── scripts/ │ │ └── label_title_regex.py │ └── workflows/ │ ├── assign.yml │ ├── check-changelog.yml │ ├── check-manifest.yml │ ├── labeler-module.yml │ ├── labeler-title-regex.yml │ ├── publish_pypi.yml │ ├── twitter.yml │ ├── unassign.yml │ └── wheels.yml ├── .gitignore ├── .mailmap ├── .pre-commit-config.yaml ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── COPYING ├── MANIFEST.in ├── Makefile ├── README.rst ├── SECURITY.md ├── asv_benchmarks/ │ ├── .gitignore │ ├── asv.conf.json │ └── benchmarks/ │ ├── __init__.py │ ├── cluster.py │ ├── common.py │ ├── config.json │ ├── datasets.py │ ├── decomposition.py │ ├── ensemble.py │ ├── linear_model.py │ ├── manifold.py │ ├── metrics.py │ ├── model_selection.py │ ├── neighbors.py │ ├── svm.py │ └── utils.py ├── azure-pipelines.yml ├── benchmarks/ │ ├── .gitignore │ ├── bench_20newsgroups.py │ ├── bench_covertype.py │ ├── bench_feature_expansions.py │ ├── bench_glm.py │ ├── bench_glmnet.py │ ├── bench_hist_gradient_boosting.py │ ├── bench_hist_gradient_boosting_adult.py │ ├── bench_hist_gradient_boosting_categorical_only.py │ ├── bench_hist_gradient_boosting_higgsboson.py │ ├── bench_hist_gradient_boosting_threading.py │ ├── bench_isolation_forest.py │ ├── bench_isotonic.py │ ├── bench_kernel_pca_solvers_time_vs_n_components.py │ ├── bench_kernel_pca_solvers_time_vs_n_samples.py │ ├── bench_lasso.py │ ├── bench_lof.py │ ├── bench_mnist.py │ ├── bench_multilabel_metrics.py │ ├── bench_online_ocsvm.py │ ├── bench_plot_fastkmeans.py │ ├── bench_plot_hierarchical.py │ ├── bench_plot_incremental_pca.py │ ├── bench_plot_lasso_path.py │ ├── bench_plot_neighbors.py │ ├── bench_plot_nmf.py │ ├── bench_plot_omp_lars.py │ ├── bench_plot_parallel_pairwise.py │ ├── bench_plot_polynomial_kernel_approximation.py │ ├── bench_plot_randomized_svd.py │ ├── bench_plot_svd.py │ ├── bench_plot_ward.py │ ├── bench_random_projections.py │ ├── bench_rcv1_logreg_convergence.py │ ├── bench_saga.py │ ├── bench_sample_without_replacement.py │ ├── bench_sgd_regression.py │ ├── bench_sparsify.py │ ├── bench_text_vectorizers.py │ ├── bench_tree.py │ ├── bench_tsne_mnist.py │ └── plot_tsne_mnist.py ├── build_tools/ │ ├── Makefile │ ├── azure/ │ │ ├── install.sh │ │ ├── install_win.sh │ │ ├── posix-docker.yml │ │ ├── posix.yml │ │ ├── test_docs.sh │ │ ├── test_docstring.sh │ │ ├── test_pytest_soft_dependency.sh │ │ ├── test_script.sh │ │ ├── upload_codecov.sh │ │ └── windows.yml │ ├── circle/ │ │ ├── build_doc.sh │ │ ├── build_test_arm.sh │ │ ├── build_test_pypy.sh │ │ ├── checkout_merge_commit.sh │ │ ├── linting.sh │ │ ├── list_versions.py │ │ └── push_doc.sh │ ├── codespell_ignore_words.txt │ ├── generate_authors_table.py │ ├── github/ │ │ ├── Windows │ │ ├── build_minimal_windows_image.sh │ │ ├── build_source.sh │ │ ├── build_wheels.sh │ │ ├── check_build_trigger.sh │ │ ├── check_wheels.py │ │ ├── repair_windows_wheels.sh │ │ ├── test_source.sh │ │ ├── test_wheels.sh │ │ ├── test_windows_wheels.sh │ │ 
├── upload_anaconda.sh │ │ └── vendor.py │ ├── shared.sh │ └── travis/ │ ├── after_success.sh │ ├── install.sh │ ├── install_main.sh │ ├── install_wheels.sh │ ├── script.sh │ ├── test_docs.sh │ ├── test_script.sh │ └── test_wheels.sh ├── conftest.py ├── doc/ │ ├── Makefile │ ├── README.md │ ├── about.rst │ ├── authors.rst │ ├── authors_emeritus.rst │ ├── binder/ │ │ └── requirements.txt │ ├── common_pitfalls.rst │ ├── communication_team.rst │ ├── computing/ │ │ ├── computational_performance.rst │ │ ├── parallelism.rst │ │ └── scaling_strategies.rst │ ├── computing.rst │ ├── conf.py │ ├── conftest.py │ ├── contents.rst │ ├── data_transforms.rst │ ├── datasets/ │ │ ├── loading_other_datasets.rst │ │ ├── real_world.rst │ │ ├── sample_generators.rst │ │ └── toy_dataset.rst │ ├── datasets.rst │ ├── developers/ │ │ ├── advanced_installation.rst │ │ ├── bug_triaging.rst │ │ ├── contributing.rst │ │ ├── develop.rst │ │ ├── index.rst │ │ ├── maintainer.rst │ │ ├── performance.rst │ │ ├── plotting.rst │ │ ├── tips.rst │ │ └── utilities.rst │ ├── faq.rst │ ├── getting_started.rst │ ├── glossary.rst │ ├── governance.rst │ ├── includes/ │ │ ├── big_toc_css.rst │ │ └── bigger_toc_css.rst │ ├── inspection.rst │ ├── install.rst │ ├── make.bat │ ├── model_selection.rst │ ├── modules/ │ │ ├── biclustering.rst │ │ ├── calibration.rst │ │ ├── classes.rst │ │ ├── clustering.rst │ │ ├── compose.rst │ │ ├── covariance.rst │ │ ├── cross_decomposition.rst │ │ ├── cross_validation.rst │ │ ├── decomposition.rst │ │ ├── density.rst │ │ ├── ensemble.rst │ │ ├── feature_extraction.rst │ │ ├── feature_selection.rst │ │ ├── gaussian_process.rst │ │ ├── grid_search.rst │ │ ├── impute.rst │ │ ├── isotonic.rst │ │ ├── kernel_approximation.rst │ │ ├── kernel_ridge.rst │ │ ├── lda_qda.rst │ │ ├── learning_curve.rst │ │ ├── linear_model.rst │ │ ├── manifold.rst │ │ ├── metrics.rst │ │ ├── mixture.rst │ │ ├── model_evaluation.rst │ │ ├── model_persistence.rst │ │ ├── multiclass.rst │ │ ├── naive_bayes.rst │ │ ├── neighbors.rst │ │ ├── neural_networks_supervised.rst │ │ ├── neural_networks_unsupervised.rst │ │ ├── outlier_detection.rst │ │ ├── partial_dependence.rst │ │ ├── permutation_importance.rst │ │ ├── pipeline.rst │ │ ├── preprocessing.rst │ │ ├── preprocessing_targets.rst │ │ ├── random_projection.rst │ │ ├── semi_supervised.rst │ │ ├── sgd.rst │ │ ├── svm.rst │ │ ├── tree.rst │ │ └── unsupervised_reduction.rst │ ├── preface.rst │ ├── presentations.rst │ ├── related_projects.rst │ ├── roadmap.rst │ ├── sphinxext/ │ │ ├── MANIFEST.in │ │ ├── add_toctree_functions.py │ │ ├── custom_references_resolver.py │ │ ├── doi_role.py │ │ ├── github_link.py │ │ └── sphinx_issues.py │ ├── supervised_learning.rst │ ├── support.rst │ ├── templates/ │ │ ├── class.rst │ │ ├── class_with_call.rst │ │ ├── deprecated_class.rst │ │ ├── deprecated_class_with_call.rst │ │ ├── deprecated_class_without_init.rst │ │ ├── deprecated_function.rst │ │ ├── function.rst │ │ ├── generate_deprecated.sh │ │ ├── index.html │ │ ├── numpydoc_docstring.rst │ │ └── redirects.html │ ├── testimonials/ │ │ ├── README.txt │ │ ├── images/ │ │ │ └── Makefile │ │ └── testimonials.rst │ ├── themes/ │ │ └── scikit-learn-modern/ │ │ ├── javascript.html │ │ ├── layout.html │ │ ├── nav.html │ │ ├── search.html │ │ ├── static/ │ │ │ ├── css/ │ │ │ │ └── theme.css │ │ │ └── js/ │ │ │ └── searchtools.js │ │ └── theme.conf │ ├── triage_team.rst │ ├── tune_toc.rst │ ├── tutorial/ │ │ ├── basic/ │ │ │ └── tutorial.rst │ │ ├── common_includes/ │ │ │ └── info.txt │ │ ├── index.rst 
│ │ ├── machine_learning_map/ │ │ │ ├── ML_MAPS_README.txt │ │ │ ├── index.rst │ │ │ ├── parse_path.py │ │ │ ├── pyparsing.py │ │ │ └── svg2imagemap.py │ │ ├── statistical_inference/ │ │ │ ├── index.rst │ │ │ ├── model_selection.rst │ │ │ ├── putting_together.rst │ │ │ ├── settings.rst │ │ │ ├── supervised_learning.rst │ │ │ └── unsupervised_learning.rst │ │ └── text_analytics/ │ │ ├── .gitignore │ │ ├── data/ │ │ │ ├── languages/ │ │ │ │ └── fetch_data.py │ │ │ ├── movie_reviews/ │ │ │ │ └── fetch_data.py │ │ │ └── twenty_newsgroups/ │ │ │ └── fetch_data.py │ │ ├── skeletons/ │ │ │ ├── exercise_01_language_train_model.py │ │ │ └── exercise_02_sentiment.py │ │ ├── solutions/ │ │ │ ├── exercise_01_language_train_model.py │ │ │ ├── exercise_02_sentiment.py │ │ │ └── generate_skeletons.py │ │ └── working_with_text_data.rst │ ├── unsupervised_learning.rst │ ├── user_guide.rst │ ├── visualizations.rst │ ├── whats_new/ │ │ ├── _contributors.rst │ │ ├── changelog_legend.inc │ │ ├── older_versions.rst │ │ ├── v0.13.rst │ │ ├── v0.14.rst │ │ ├── v0.15.rst │ │ ├── v0.16.rst │ │ ├── v0.17.rst │ │ ├── v0.18.rst │ │ ├── v0.19.rst │ │ ├── v0.20.rst │ │ ├── v0.21.rst │ │ ├── v0.22.rst │ │ ├── v0.23.rst │ │ ├── v0.24.rst │ │ ├── v1.0.rst │ │ └── v1.1.rst │ └── whats_new.rst ├── examples/ │ ├── README.txt │ ├── applications/ │ │ ├── README.txt │ │ ├── plot_cyclical_feature_engineering.py │ │ ├── plot_digits_denoising.py │ │ ├── plot_face_recognition.py │ │ ├── plot_model_complexity_influence.py │ │ ├── plot_out_of_core_classification.py │ │ ├── plot_outlier_detection_wine.py │ │ ├── plot_prediction_latency.py │ │ ├── plot_species_distribution_modeling.py │ │ ├── plot_stock_market.py │ │ ├── plot_tomography_l1_reconstruction.py │ │ ├── plot_topics_extraction_with_nmf_lda.py │ │ ├── svm_gui.py │ │ └── wikipedia_principal_eigenvector.py │ ├── bicluster/ │ │ ├── README.txt │ │ ├── plot_bicluster_newsgroups.py │ │ ├── plot_spectral_biclustering.py │ │ └── plot_spectral_coclustering.py │ ├── calibration/ │ │ ├── README.txt │ │ ├── plot_calibration.py │ │ ├── plot_calibration_curve.py │ │ ├── plot_calibration_multiclass.py │ │ └── plot_compare_calibration.py │ ├── classification/ │ │ ├── README.txt │ │ ├── plot_classification_probability.py │ │ ├── plot_classifier_comparison.py │ │ ├── plot_digits_classification.py │ │ ├── plot_lda.py │ │ └── plot_lda_qda.py │ ├── cluster/ │ │ ├── README.txt │ │ ├── plot_adjusted_for_chance_measures.py │ │ ├── plot_affinity_propagation.py │ │ ├── plot_agglomerative_clustering.py │ │ ├── plot_agglomerative_clustering_metrics.py │ │ ├── plot_agglomerative_dendrogram.py │ │ ├── plot_birch_vs_minibatchkmeans.py │ │ ├── plot_cluster_comparison.py │ │ ├── plot_cluster_iris.py │ │ ├── plot_coin_segmentation.py │ │ ├── plot_coin_ward_segmentation.py │ │ ├── plot_color_quantization.py │ │ ├── plot_dbscan.py │ │ ├── plot_dict_face_patches.py │ │ ├── plot_digits_agglomeration.py │ │ ├── plot_digits_linkage.py │ │ ├── plot_face_compress.py │ │ ├── plot_feature_agglomeration_vs_univariate_selection.py │ │ ├── plot_inductive_clustering.py │ │ ├── plot_kmeans_assumptions.py │ │ ├── plot_kmeans_digits.py │ │ ├── plot_kmeans_plusplus.py │ │ ├── plot_kmeans_silhouette_analysis.py │ │ ├── plot_kmeans_stability_low_dim_dense.py │ │ ├── plot_linkage_comparison.py │ │ ├── plot_mean_shift.py │ │ ├── plot_mini_batch_kmeans.py │ │ ├── plot_optics.py │ │ ├── plot_segmentation_toy.py │ │ └── plot_ward_structured_vs_unstructured.py │ ├── compose/ │ │ ├── README.txt │ │ ├── plot_column_transformer.py │ │ ├── 
plot_column_transformer_mixed_types.py │ │ ├── plot_compare_reduction.py │ │ ├── plot_digits_pipe.py │ │ ├── plot_feature_union.py │ │ └── plot_transformed_target.py │ ├── covariance/ │ │ ├── README.txt │ │ ├── plot_covariance_estimation.py │ │ ├── plot_lw_vs_oas.py │ │ ├── plot_mahalanobis_distances.py │ │ ├── plot_robust_vs_empirical_covariance.py │ │ └── plot_sparse_cov.py │ ├── cross_decomposition/ │ │ ├── README.txt │ │ ├── plot_compare_cross_decomposition.py │ │ └── plot_pcr_vs_pls.py │ ├── datasets/ │ │ ├── README.txt │ │ ├── plot_digits_last_image.py │ │ ├── plot_iris_dataset.py │ │ ├── plot_random_dataset.py │ │ └── plot_random_multilabel_dataset.py │ ├── decomposition/ │ │ ├── README.txt │ │ ├── plot_beta_divergence.py │ │ ├── plot_faces_decomposition.py │ │ ├── plot_ica_blind_source_separation.py │ │ ├── plot_ica_vs_pca.py │ │ ├── plot_image_denoising.py │ │ ├── plot_incremental_pca.py │ │ ├── plot_kernel_pca.py │ │ ├── plot_pca_3d.py │ │ ├── plot_pca_iris.py │ │ ├── plot_pca_vs_fa_model_selection.py │ │ ├── plot_pca_vs_lda.py │ │ ├── plot_sparse_coding.py │ │ └── plot_varimax_fa.py │ ├── ensemble/ │ │ ├── README.txt │ │ ├── plot_adaboost_hastie_10_2.py │ │ ├── plot_adaboost_multiclass.py │ │ ├── plot_adaboost_regression.py │ │ ├── plot_adaboost_twoclass.py │ │ ├── plot_bias_variance.py │ │ ├── plot_ensemble_oob.py │ │ ├── plot_feature_transformation.py │ │ ├── plot_forest_importances.py │ │ ├── plot_forest_importances_faces.py │ │ ├── plot_forest_iris.py │ │ ├── plot_gradient_boosting_categorical.py │ │ ├── plot_gradient_boosting_early_stopping.py │ │ ├── plot_gradient_boosting_oob.py │ │ ├── plot_gradient_boosting_quantile.py │ │ ├── plot_gradient_boosting_regression.py │ │ ├── plot_gradient_boosting_regularization.py │ │ ├── plot_isolation_forest.py │ │ ├── plot_monotonic_constraints.py │ │ ├── plot_random_forest_embedding.py │ │ ├── plot_random_forest_regression_multioutput.py │ │ ├── plot_stack_predictors.py │ │ ├── plot_voting_decision_regions.py │ │ ├── plot_voting_probas.py │ │ └── plot_voting_regressor.py │ ├── exercises/ │ │ ├── README.txt │ │ ├── plot_cv_diabetes.py │ │ ├── plot_cv_digits.py │ │ ├── plot_digits_classification_exercise.py │ │ └── plot_iris_exercise.py │ ├── feature_selection/ │ │ ├── README.txt │ │ ├── plot_f_test_vs_mi.py │ │ ├── plot_feature_selection.py │ │ ├── plot_feature_selection_pipeline.py │ │ ├── plot_rfe_digits.py │ │ ├── plot_rfe_with_cross_validation.py │ │ └── plot_select_from_model_diabetes.py │ ├── gaussian_process/ │ │ ├── README.txt │ │ ├── plot_compare_gpr_krr.py │ │ ├── plot_gpc.py │ │ ├── plot_gpc_iris.py │ │ ├── plot_gpc_isoprobability.py │ │ ├── plot_gpc_xor.py │ │ ├── plot_gpr_co2.py │ │ ├── plot_gpr_noisy.py │ │ ├── plot_gpr_noisy_targets.py │ │ ├── plot_gpr_on_structured_data.py │ │ └── plot_gpr_prior_posterior.py │ ├── impute/ │ │ ├── README.txt │ │ ├── plot_iterative_imputer_variants_comparison.py │ │ └── plot_missing_values.py │ ├── inspection/ │ │ ├── README.txt │ │ ├── plot_linear_model_coefficient_interpretation.py │ │ ├── plot_partial_dependence.py │ │ ├── plot_permutation_importance.py │ │ └── plot_permutation_importance_multicollinear.py │ ├── kernel_approximation/ │ │ ├── README.txt │ │ └── plot_scalable_poly_kernels.py │ ├── linear_model/ │ │ ├── README.txt │ │ ├── plot_ard.py │ │ ├── plot_bayesian_ridge.py │ │ ├── plot_bayesian_ridge_curvefit.py │ │ ├── plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py │ │ ├── plot_huber_vs_ridge.py │ │ ├── plot_iris_logistic.py │ │ ├── plot_lasso_and_elasticnet.py 
│ │ ├── plot_lasso_coordinate_descent_path.py │ │ ├── plot_lasso_dense_vs_sparse_data.py │ │ ├── plot_lasso_lars.py │ │ ├── plot_lasso_model_selection.py │ │ ├── plot_logistic.py │ │ ├── plot_logistic_l1_l2_sparsity.py │ │ ├── plot_logistic_multinomial.py │ │ ├── plot_logistic_path.py │ │ ├── plot_multi_task_lasso_support.py │ │ ├── plot_nnls.py │ │ ├── plot_ols.py │ │ ├── plot_ols_3d.py │ │ ├── plot_ols_ridge_variance.py │ │ ├── plot_omp.py │ │ ├── plot_poisson_regression_non_normal_loss.py │ │ ├── plot_polynomial_interpolation.py │ │ ├── plot_quantile_regression.py │ │ ├── plot_ransac.py │ │ ├── plot_ridge_coeffs.py │ │ ├── plot_ridge_path.py │ │ ├── plot_robust_fit.py │ │ ├── plot_sgd_comparison.py │ │ ├── plot_sgd_early_stopping.py │ │ ├── plot_sgd_iris.py │ │ ├── plot_sgd_loss_functions.py │ │ ├── plot_sgd_penalties.py │ │ ├── plot_sgd_separating_hyperplane.py │ │ ├── plot_sgd_weighted_samples.py │ │ ├── plot_sgdocsvm_vs_ocsvm.py │ │ ├── plot_sparse_logistic_regression_20newsgroups.py │ │ ├── plot_sparse_logistic_regression_mnist.py │ │ ├── plot_theilsen.py │ │ └── plot_tweedie_regression_insurance_claims.py │ ├── manifold/ │ │ ├── README.txt │ │ ├── plot_compare_methods.py │ │ ├── plot_lle_digits.py │ │ ├── plot_manifold_sphere.py │ │ ├── plot_mds.py │ │ ├── plot_swissroll.py │ │ └── plot_t_sne_perplexity.py │ ├── miscellaneous/ │ │ ├── README.txt │ │ ├── plot_anomaly_comparison.py │ │ ├── plot_changed_only_pprint_parameter.py │ │ ├── plot_display_object_visualization.py │ │ ├── plot_isotonic_regression.py │ │ ├── plot_johnson_lindenstrauss_bound.py │ │ ├── plot_kernel_approximation.py │ │ ├── plot_kernel_ridge_regression.py │ │ ├── plot_multilabel.py │ │ ├── plot_multioutput_face_completion.py │ │ ├── plot_partial_dependence_visualization_api.py │ │ ├── plot_pipeline_display.py │ │ └── plot_roc_curve_visualization_api.py │ ├── mixture/ │ │ ├── README.txt │ │ ├── plot_concentration_prior.py │ │ ├── plot_gmm.py │ │ ├── plot_gmm_covariances.py │ │ ├── plot_gmm_pdf.py │ │ ├── plot_gmm_selection.py │ │ └── plot_gmm_sin.py │ ├── model_selection/ │ │ ├── README.txt │ │ ├── grid_search_text_feature_extraction.py │ │ ├── plot_confusion_matrix.py │ │ ├── plot_cv_indices.py │ │ ├── plot_cv_predict.py │ │ ├── plot_det.py │ │ ├── plot_grid_search_digits.py │ │ ├── plot_grid_search_refit_callable.py │ │ ├── plot_grid_search_stats.py │ │ ├── plot_learning_curve.py │ │ ├── plot_multi_metric_evaluation.py │ │ ├── plot_nested_cross_validation_iris.py │ │ ├── plot_permutation_tests_for_classification.py │ │ ├── plot_precision_recall.py │ │ ├── plot_randomized_search.py │ │ ├── plot_roc.py │ │ ├── plot_roc_crossval.py │ │ ├── plot_successive_halving_heatmap.py │ │ ├── plot_successive_halving_iterations.py │ │ ├── plot_train_error_vs_test_error.py │ │ ├── plot_underfitting_overfitting.py │ │ └── plot_validation_curve.py │ ├── multioutput/ │ │ ├── README.txt │ │ └── plot_classifier_chain_yeast.py │ ├── neighbors/ │ │ ├── README.txt │ │ ├── approximate_nearest_neighbors.py │ │ ├── plot_caching_nearest_neighbors.py │ │ ├── plot_classification.py │ │ ├── plot_digits_kde_sampling.py │ │ ├── plot_kde_1d.py │ │ ├── plot_lof_novelty_detection.py │ │ ├── plot_lof_outlier_detection.py │ │ ├── plot_nca_classification.py │ │ ├── plot_nca_dim_reduction.py │ │ ├── plot_nca_illustration.py │ │ ├── plot_nearest_centroid.py │ │ ├── plot_regression.py │ │ └── plot_species_kde.py │ ├── neural_networks/ │ │ ├── README.txt │ │ ├── plot_mlp_alpha.py │ │ ├── plot_mlp_training_curves.py │ │ ├── plot_mnist_filters.py │ │ └── 
plot_rbm_logistic_classification.py │ ├── preprocessing/ │ │ ├── README.txt │ │ ├── plot_all_scaling.py │ │ ├── plot_discretization.py │ │ ├── plot_discretization_classification.py │ │ ├── plot_discretization_strategies.py │ │ ├── plot_map_data_to_normal.py │ │ └── plot_scaling_importance.py │ ├── release_highlights/ │ │ ├── README.txt │ │ ├── plot_release_highlights_0_22_0.py │ │ ├── plot_release_highlights_0_23_0.py │ │ ├── plot_release_highlights_0_24_0.py │ │ └── plot_release_highlights_1_0_0.py │ ├── semi_supervised/ │ │ ├── README.txt │ │ ├── plot_label_propagation_digits.py │ │ ├── plot_label_propagation_digits_active_learning.py │ │ ├── plot_label_propagation_structure.py │ │ ├── plot_self_training_varying_threshold.py │ │ ├── plot_semi_supervised_newsgroups.py │ │ └── plot_semi_supervised_versus_svm_iris.py │ ├── svm/ │ │ ├── README.txt │ │ ├── plot_custom_kernel.py │ │ ├── plot_iris_svc.py │ │ ├── plot_linearsvc_support_vectors.py │ │ ├── plot_oneclass.py │ │ ├── plot_rbf_parameters.py │ │ ├── plot_separating_hyperplane.py │ │ ├── plot_separating_hyperplane_unbalanced.py │ │ ├── plot_svm_anova.py │ │ ├── plot_svm_kernels.py │ │ ├── plot_svm_margin.py │ │ ├── plot_svm_nonlinear.py │ │ ├── plot_svm_regression.py │ │ ├── plot_svm_scale_c.py │ │ ├── plot_svm_tie_breaking.py │ │ └── plot_weighted_samples.py │ ├── text/ │ │ ├── README.txt │ │ ├── plot_document_classification_20newsgroups.py │ │ ├── plot_document_clustering.py │ │ └── plot_hashing_vs_dict_vectorizer.py │ └── tree/ │ ├── README.txt │ ├── plot_cost_complexity_pruning.py │ ├── plot_iris_dtc.py │ ├── plot_tree_regression.py │ ├── plot_tree_regression_multioutput.py │ └── plot_unveil_tree_structure.py ├── lgtm.yml ├── maint_tools/ │ ├── check_pxd_in_installation.py │ ├── create_issue_from_juint.py │ ├── sort_whats_new.py │ ├── test_docstrings.py │ └── whats_missing.sh ├── pyproject.toml ├── setup.cfg ├── setup.py └── sklearn/ ├── __check_build/ │ ├── __init__.py │ ├── _check_build.pyx │ └── setup.py ├── __init__.py ├── _build_utils/ │ ├── __init__.py │ ├── openmp_helpers.py │ └── pre_build_helpers.py ├── _config.py ├── _distributor_init.py ├── _isotonic.pyx ├── _loss/ │ ├── __init__.py │ ├── glm_distribution.py │ └── tests/ │ ├── __init__.py │ └── test_glm_distribution.py ├── _min_dependencies.py ├── base.py ├── calibration.py ├── cluster/ │ ├── __init__.py │ ├── _affinity_propagation.py │ ├── _agglomerative.py │ ├── _bicluster.py │ ├── _birch.py │ ├── _dbscan.py │ ├── _dbscan_inner.pyx │ ├── _feature_agglomeration.py │ ├── _hierarchical_fast.pyx │ ├── _k_means_common.pxd │ ├── _k_means_common.pyx │ ├── _k_means_elkan.pyx │ ├── _k_means_lloyd.pyx │ ├── _k_means_minibatch.pyx │ ├── _kmeans.py │ ├── _mean_shift.py │ ├── _optics.py │ ├── _spectral.py │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── common.py │ ├── test_affinity_propagation.py │ ├── test_bicluster.py │ ├── test_birch.py │ ├── test_dbscan.py │ ├── test_feature_agglomeration.py │ ├── test_hierarchical.py │ ├── test_k_means.py │ ├── test_mean_shift.py │ ├── test_optics.py │ └── test_spectral.py ├── compose/ │ ├── __init__.py │ ├── _column_transformer.py │ ├── _target.py │ └── tests/ │ ├── __init__.py │ ├── test_column_transformer.py │ └── test_target.py ├── conftest.py ├── covariance/ │ ├── __init__.py │ ├── _elliptic_envelope.py │ ├── _empirical_covariance.py │ ├── _graph_lasso.py │ ├── _robust_covariance.py │ ├── _shrunk_covariance.py │ └── tests/ │ ├── __init__.py │ ├── test_covariance.py │ ├── test_elliptic_envelope.py │ ├── test_graphical_lasso.py │ └── 
test_robust_covariance.py ├── cross_decomposition/ │ ├── __init__.py │ ├── _pls.py │ └── tests/ │ ├── __init__.py │ └── test_pls.py ├── datasets/ │ ├── __init__.py │ ├── _base.py │ ├── _california_housing.py │ ├── _covtype.py │ ├── _kddcup99.py │ ├── _lfw.py │ ├── _olivetti_faces.py │ ├── _openml.py │ ├── _rcv1.py │ ├── _samples_generator.py │ ├── _species_distributions.py │ ├── _svmlight_format_fast.pyx │ ├── _svmlight_format_io.py │ ├── _twenty_newsgroups.py │ ├── data/ │ │ ├── __init__.py │ │ ├── boston_house_prices.csv │ │ ├── breast_cancer.csv │ │ ├── iris.csv │ │ ├── linnerud_exercise.csv │ │ ├── linnerud_physiological.csv │ │ └── wine_data.csv │ ├── descr/ │ │ ├── __init__.py │ │ ├── boston_house_prices.rst │ │ ├── breast_cancer.rst │ │ ├── california_housing.rst │ │ ├── covtype.rst │ │ ├── diabetes.rst │ │ ├── digits.rst │ │ ├── iris.rst │ │ ├── kddcup99.rst │ │ ├── lfw.rst │ │ ├── linnerud.rst │ │ ├── olivetti_faces.rst │ │ ├── rcv1.rst │ │ ├── twenty_newsgroups.rst │ │ └── wine_data.rst │ ├── images/ │ │ ├── README.txt │ │ └── __init__.py │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── conftest.py │ ├── data/ │ │ ├── __init__.py │ │ ├── openml/ │ │ │ ├── __init__.py │ │ │ ├── id_1/ │ │ │ │ └── __init__.py │ │ │ ├── id_1119/ │ │ │ │ └── __init__.py │ │ │ ├── id_2/ │ │ │ │ └── __init__.py │ │ │ ├── id_292/ │ │ │ │ └── __init__.py │ │ │ ├── id_3/ │ │ │ │ └── __init__.py │ │ │ ├── id_40589/ │ │ │ │ └── __init__.py │ │ │ ├── id_40675/ │ │ │ │ └── __init__.py │ │ │ ├── id_40945/ │ │ │ │ └── __init__.py │ │ │ ├── id_40966/ │ │ │ │ └── __init__.py │ │ │ ├── id_42585/ │ │ │ │ └── __init__.py │ │ │ ├── id_561/ │ │ │ │ └── __init__.py │ │ │ ├── id_61/ │ │ │ │ └── __init__.py │ │ │ └── id_62/ │ │ │ └── __init__.py │ │ ├── svmlight_classification.txt │ │ ├── svmlight_invalid.txt │ │ ├── svmlight_invalid_order.txt │ │ └── svmlight_multilabel.txt │ ├── test_20news.py │ ├── test_base.py │ ├── test_california_housing.py │ ├── test_common.py │ ├── test_covtype.py │ ├── test_kddcup99.py │ ├── test_lfw.py │ ├── test_olivetti_faces.py │ ├── test_openml.py │ ├── test_rcv1.py │ ├── test_samples_generator.py │ └── test_svmlight_format.py ├── decomposition/ │ ├── __init__.py │ ├── _base.py │ ├── _cdnmf_fast.pyx │ ├── _dict_learning.py │ ├── _factor_analysis.py │ ├── _fastica.py │ ├── _incremental_pca.py │ ├── _kernel_pca.py │ ├── _lda.py │ ├── _nmf.py │ ├── _online_lda_fast.pyx │ ├── _pca.py │ ├── _sparse_pca.py │ ├── _truncated_svd.py │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── test_dict_learning.py │ ├── test_factor_analysis.py │ ├── test_fastica.py │ ├── test_incremental_pca.py │ ├── test_kernel_pca.py │ ├── test_nmf.py │ ├── test_online_lda.py │ ├── test_pca.py │ ├── test_sparse_pca.py │ └── test_truncated_svd.py ├── discriminant_analysis.py ├── dummy.py ├── ensemble/ │ ├── __init__.py │ ├── _bagging.py │ ├── _base.py │ ├── _forest.py │ ├── _gb.py │ ├── _gb_losses.py │ ├── _gradient_boosting.pyx │ ├── _hist_gradient_boosting/ │ │ ├── __init__.py │ │ ├── _binning.pyx │ │ ├── _bitset.pxd │ │ ├── _bitset.pyx │ │ ├── _gradient_boosting.pyx │ │ ├── _loss.pyx │ │ ├── _predictor.pyx │ │ ├── binning.py │ │ ├── common.pxd │ │ ├── common.pyx │ │ ├── gradient_boosting.py │ │ ├── grower.py │ │ ├── histogram.pyx │ │ ├── loss.py │ │ ├── predictor.py │ │ ├── splitting.pyx │ │ ├── tests/ │ │ │ ├── __init__.py │ │ │ ├── test_binning.py │ │ │ ├── test_bitset.py │ │ │ ├── test_compare_lightgbm.py │ │ │ ├── test_gradient_boosting.py │ │ │ ├── test_grower.py │ │ │ ├── test_histogram.py │ │ │ ├── test_loss.py 
│ │ │ ├── test_monotonic_contraints.py │ │ │ ├── test_predictor.py │ │ │ ├── test_splitting.py │ │ │ └── test_warm_start.py │ │ └── utils.pyx │ ├── _iforest.py │ ├── _stacking.py │ ├── _voting.py │ ├── _weight_boosting.py │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── test_bagging.py │ ├── test_base.py │ ├── test_common.py │ ├── test_forest.py │ ├── test_gradient_boosting.py │ ├── test_gradient_boosting_loss_functions.py │ ├── test_iforest.py │ ├── test_stacking.py │ ├── test_voting.py │ └── test_weight_boosting.py ├── exceptions.py ├── experimental/ │ ├── __init__.py │ ├── enable_halving_search_cv.py │ ├── enable_hist_gradient_boosting.py │ ├── enable_iterative_imputer.py │ └── tests/ │ ├── __init__.py │ ├── test_enable_hist_gradient_boosting.py │ ├── test_enable_iterative_imputer.py │ └── test_enable_successive_halving.py ├── externals/ │ ├── README │ ├── __init__.py │ ├── _arff.py │ ├── _lobpcg.py │ ├── _packaging/ │ │ ├── __init__.py │ │ ├── _structures.py │ │ └── version.py │ ├── _pilutil.py │ └── conftest.py ├── feature_extraction/ │ ├── __init__.py │ ├── _dict_vectorizer.py │ ├── _hash.py │ ├── _hashing_fast.pyx │ ├── _stop_words.py │ ├── image.py │ ├── setup.py │ ├── tests/ │ │ ├── __init__.py │ │ ├── test_dict_vectorizer.py │ │ ├── test_feature_hasher.py │ │ ├── test_image.py │ │ └── test_text.py │ └── text.py ├── feature_selection/ │ ├── __init__.py │ ├── _base.py │ ├── _from_model.py │ ├── _mutual_info.py │ ├── _rfe.py │ ├── _sequential.py │ ├── _univariate_selection.py │ ├── _variance_threshold.py │ └── tests/ │ ├── __init__.py │ ├── test_base.py │ ├── test_chi2.py │ ├── test_feature_select.py │ ├── test_from_model.py │ ├── test_mutual_info.py │ ├── test_rfe.py │ ├── test_sequential.py │ └── test_variance_threshold.py ├── gaussian_process/ │ ├── __init__.py │ ├── _gpc.py │ ├── _gpr.py │ ├── kernels.py │ └── tests/ │ ├── __init__.py │ ├── _mini_sequence_kernel.py │ ├── test_gpc.py │ ├── test_gpr.py │ └── test_kernels.py ├── impute/ │ ├── __init__.py │ ├── _base.py │ ├── _iterative.py │ ├── _knn.py │ └── tests/ │ ├── __init__.py │ ├── test_base.py │ ├── test_common.py │ ├── test_impute.py │ └── test_knn.py ├── inspection/ │ ├── __init__.py │ ├── _partial_dependence.py │ ├── _permutation_importance.py │ ├── _plot/ │ │ ├── __init__.py │ │ ├── partial_dependence.py │ │ └── tests/ │ │ ├── __init__.py │ │ └── test_plot_partial_dependence.py │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── test_partial_dependence.py │ └── test_permutation_importance.py ├── isotonic.py ├── kernel_approximation.py ├── kernel_ridge.py ├── linear_model/ │ ├── __init__.py │ ├── _base.py │ ├── _bayes.py │ ├── _cd_fast.pyx │ ├── _coordinate_descent.py │ ├── _glm/ │ │ ├── __init__.py │ │ ├── glm.py │ │ ├── link.py │ │ └── tests/ │ │ ├── __init__.py │ │ ├── test_glm.py │ │ └── test_link.py │ ├── _huber.py │ ├── _least_angle.py │ ├── _logistic.py │ ├── _omp.py │ ├── _passive_aggressive.py │ ├── _perceptron.py │ ├── _quantile.py │ ├── _ransac.py │ ├── _ridge.py │ ├── _sag.py │ ├── _sag_fast.pyx.tp │ ├── _sgd_fast.pxd │ ├── _sgd_fast.pyx │ ├── _sgd_fast_helpers.h │ ├── _stochastic_gradient.py │ ├── _theil_sen.py │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── test_base.py │ ├── test_bayes.py │ ├── test_common.py │ ├── test_coordinate_descent.py │ ├── test_huber.py │ ├── test_least_angle.py │ ├── test_logistic.py │ ├── test_omp.py │ ├── test_passive_aggressive.py │ ├── test_perceptron.py │ ├── test_quantile.py │ ├── test_ransac.py │ ├── test_ridge.py │ ├── test_sag.py │ ├── test_sgd.py │ ├── 
test_sparse_coordinate_descent.py │ └── test_theil_sen.py ├── manifold/ │ ├── __init__.py │ ├── _barnes_hut_tsne.pyx │ ├── _isomap.py │ ├── _locally_linear.py │ ├── _mds.py │ ├── _spectral_embedding.py │ ├── _t_sne.py │ ├── _utils.pyx │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── test_isomap.py │ ├── test_locally_linear.py │ ├── test_mds.py │ ├── test_spectral_embedding.py │ └── test_t_sne.py ├── metrics/ │ ├── __init__.py │ ├── _base.py │ ├── _classification.py │ ├── _dist_metrics.pxd │ ├── _dist_metrics.pyx │ ├── _pairwise_fast.pyx │ ├── _plot/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── confusion_matrix.py │ │ ├── det_curve.py │ │ ├── precision_recall_curve.py │ │ ├── roc_curve.py │ │ └── tests/ │ │ ├── __init__.py │ │ ├── test_base.py │ │ ├── test_common_curve_display.py │ │ ├── test_confusion_matrix_display.py │ │ ├── test_det_curve_display.py │ │ ├── test_plot_confusion_matrix.py │ │ ├── test_plot_curve_common.py │ │ ├── test_plot_det_curve.py │ │ ├── test_plot_precision_recall.py │ │ ├── test_plot_roc_curve.py │ │ ├── test_precision_recall_display.py │ │ └── test_roc_curve_display.py │ ├── _ranking.py │ ├── _regression.py │ ├── _scorer.py │ ├── cluster/ │ │ ├── __init__.py │ │ ├── _bicluster.py │ │ ├── _expected_mutual_info_fast.pyx │ │ ├── _supervised.py │ │ ├── _unsupervised.py │ │ ├── setup.py │ │ └── tests/ │ │ ├── __init__.py │ │ ├── test_bicluster.py │ │ ├── test_common.py │ │ ├── test_supervised.py │ │ └── test_unsupervised.py │ ├── pairwise.py │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── test_classification.py │ ├── test_common.py │ ├── test_dist_metrics.py │ ├── test_pairwise.py │ ├── test_ranking.py │ ├── test_regression.py │ └── test_score_objects.py ├── mixture/ │ ├── __init__.py │ ├── _base.py │ ├── _bayesian_mixture.py │ ├── _gaussian_mixture.py │ └── tests/ │ ├── __init__.py │ ├── test_bayesian_mixture.py │ ├── test_gaussian_mixture.py │ └── test_mixture.py ├── model_selection/ │ ├── __init__.py │ ├── _search.py │ ├── _search_successive_halving.py │ ├── _split.py │ ├── _validation.py │ └── tests/ │ ├── __init__.py │ ├── common.py │ ├── test_search.py │ ├── test_split.py │ ├── test_successive_halving.py │ └── test_validation.py ├── multiclass.py ├── multioutput.py ├── naive_bayes.py ├── neighbors/ │ ├── __init__.py │ ├── _ball_tree.pyx │ ├── _base.py │ ├── _binary_tree.pxi │ ├── _classification.py │ ├── _distance_metric.py │ ├── _graph.py │ ├── _kd_tree.pyx │ ├── _kde.py │ ├── _lof.py │ ├── _nca.py │ ├── _nearest_centroid.py │ ├── _partition_nodes.pxd │ ├── _partition_nodes.pyx │ ├── _quad_tree.pxd │ ├── _quad_tree.pyx │ ├── _regression.py │ ├── _unsupervised.py │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── test_ball_tree.py │ ├── test_graph.py │ ├── test_kd_tree.py │ ├── test_kde.py │ ├── test_lof.py │ ├── test_nca.py │ ├── test_nearest_centroid.py │ ├── test_neighbors.py │ ├── test_neighbors_pipeline.py │ ├── test_neighbors_tree.py │ └── test_quad_tree.py ├── neural_network/ │ ├── __init__.py │ ├── _base.py │ ├── _multilayer_perceptron.py │ ├── _rbm.py │ ├── _stochastic_optimizers.py │ └── tests/ │ ├── __init__.py │ ├── test_base.py │ ├── test_mlp.py │ ├── test_rbm.py │ └── test_stochastic_optimizers.py ├── pipeline.py ├── preprocessing/ │ ├── __init__.py │ ├── _csr_polynomial_expansion.pyx │ ├── _data.py │ ├── _discretization.py │ ├── _encoders.py │ ├── _function_transformer.py │ ├── _label.py │ ├── _polynomial.py │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── test_common.py │ ├── test_data.py │ ├── test_discretization.py │ ├── 
test_encoders.py │ ├── test_function_transformer.py │ ├── test_label.py │ └── test_polynomial.py ├── random_projection.py ├── semi_supervised/ │ ├── __init__.py │ ├── _label_propagation.py │ ├── _self_training.py │ └── tests/ │ ├── __init__.py │ ├── test_label_propagation.py │ └── test_self_training.py ├── setup.py ├── svm/ │ ├── __init__.py │ ├── _base.py │ ├── _bounds.py │ ├── _classes.py │ ├── _liblinear.pxi │ ├── _liblinear.pyx │ ├── _libsvm.pxi │ ├── _libsvm.pyx │ ├── _libsvm_sparse.pyx │ ├── _newrand.pyx │ ├── setup.py │ ├── src/ │ │ ├── liblinear/ │ │ │ ├── COPYRIGHT │ │ │ ├── _cython_blas_helpers.h │ │ │ ├── liblinear_helper.c │ │ │ ├── linear.cpp │ │ │ ├── linear.h │ │ │ ├── tron.cpp │ │ │ └── tron.h │ │ ├── libsvm/ │ │ │ ├── LIBSVM_CHANGES │ │ │ ├── _svm_cython_blas_helpers.h │ │ │ ├── libsvm_helper.c │ │ │ ├── libsvm_sparse_helper.c │ │ │ ├── libsvm_template.cpp │ │ │ ├── svm.cpp │ │ │ └── svm.h │ │ └── newrand/ │ │ └── newrand.h │ └── tests/ │ ├── __init__.py │ ├── test_bounds.py │ ├── test_sparse.py │ └── test_svm.py ├── tests/ │ ├── __init__.py │ ├── test_base.py │ ├── test_build.py │ ├── test_calibration.py │ ├── test_check_build.py │ ├── test_common.py │ ├── test_config.py │ ├── test_discriminant_analysis.py │ ├── test_docstring_parameters.py │ ├── test_dummy.py │ ├── test_init.py │ ├── test_isotonic.py │ ├── test_kernel_approximation.py │ ├── test_kernel_ridge.py │ ├── test_metaestimators.py │ ├── test_min_dependencies_readme.py │ ├── test_multiclass.py │ ├── test_multioutput.py │ ├── test_naive_bayes.py │ ├── test_pipeline.py │ └── test_random_projection.py ├── tree/ │ ├── __init__.py │ ├── _classes.py │ ├── _criterion.pxd │ ├── _criterion.pyx │ ├── _export.py │ ├── _reingold_tilford.py │ ├── _splitter.pxd │ ├── _splitter.pyx │ ├── _tree.pxd │ ├── _tree.pyx │ ├── _utils.pxd │ ├── _utils.pyx │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── test_export.py │ ├── test_reingold_tilford.py │ └── test_tree.py └── utils/ ├── __init__.py ├── _arpack.py ├── _cython_blas.pxd ├── _cython_blas.pyx ├── _encode.py ├── _estimator_html_repr.py ├── _fast_dict.pxd ├── _fast_dict.pyx ├── _joblib.py ├── _logistic_sigmoid.pyx ├── _mask.py ├── _mocking.py ├── _openmp_helpers.pyx ├── _pprint.py ├── _random.pxd ├── _random.pyx ├── _readonly_array_wrapper.pyx ├── _seq_dataset.pxd.tp ├── _seq_dataset.pyx.tp ├── _show_versions.py ├── _tags.py ├── _testing.py ├── _typedefs.pxd ├── _typedefs.pyx ├── _weight_vector.pxd.tp ├── _weight_vector.pyx.tp ├── arrayfuncs.pyx ├── class_weight.py ├── deprecation.py ├── estimator_checks.py ├── extmath.py ├── fixes.py ├── graph.py ├── metaestimators.py ├── multiclass.py ├── murmurhash.pxd ├── murmurhash.pyx ├── optimize.py ├── random.py ├── setup.py ├── sparsefuncs.py ├── sparsefuncs_fast.pyx ├── src/ │ ├── MurmurHash3.cpp │ └── MurmurHash3.h ├── stats.py ├── tests/ │ ├── __init__.py │ ├── conftest.py │ ├── test_arpack.py │ ├── test_arrayfuncs.py │ ├── test_class_weight.py │ ├── test_cython_blas.py │ ├── test_cython_templating.py │ ├── test_deprecation.py │ ├── test_encode.py │ ├── test_estimator_checks.py │ ├── test_estimator_html_repr.py │ ├── test_extmath.py │ ├── test_fast_dict.py │ ├── test_fixes.py │ ├── test_graph.py │ ├── test_metaestimators.py │ ├── test_mocking.py │ ├── test_multiclass.py │ ├── test_murmurhash.py │ ├── test_optimize.py │ ├── test_parallel.py │ ├── test_pprint.py │ ├── test_random.py │ ├── test_readonly_wrapper.py │ ├── test_seq_dataset.py │ ├── test_shortest_path.py │ ├── test_show_versions.py │ ├── test_sparsefuncs.py │ ├── 
test_stats.py │ ├── test_tags.py │ ├── test_testing.py │ ├── test_utils.py │ ├── test_validation.py │ └── test_weight_vector.py └── validation.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .binder/postBuild ================================================ #!/bin/bash set -e # This script is called in a binder context. When this script is called, we are # inside a git checkout of the scikit-learn/scikit-learn repo. This script is # generating notebooks from the scikit-learn python examples. if [[ ! -f /.dockerenv ]]; then echo "This script was written for repo2docker and is supposed to run inside a docker container." echo "Exiting because this script can delete data if run outside of a docker container." exit 1 fi # Back up content we need from the scikit-learn repo TMP_CONTENT_DIR=/tmp/scikit-learn mkdir -p $TMP_CONTENT_DIR cp -r examples .binder $TMP_CONTENT_DIR # delete everything in current directory including dot files and dot folders find . -delete # Generate notebooks and remove other files from examples folder GENERATED_NOTEBOOKS_DIR=.generated-notebooks cp -r $TMP_CONTENT_DIR/examples $GENERATED_NOTEBOOKS_DIR find $GENERATED_NOTEBOOKS_DIR -name '*.py' -exec sphx_glr_python_to_jupyter.py '{}' + NON_NOTEBOOKS=$(find $GENERATED_NOTEBOOKS_DIR -type f | grep -v '\.ipynb') rm -f $NON_NOTEBOOKS # Put the .binder folder back (may be useful for debugging purposes) mv $TMP_CONTENT_DIR/.binder . # Final clean up rm -rf $TMP_CONTENT_DIR # This is for compatibility with binder sphinx-gallery integration: this makes # sure that the binder links generated by sphinx-gallery are correct even though # the repo we use for binder (scikit-learn/scikit-learn) is not the repo of the # generated doc (scikit-learn/scikit-learn.github.io) mkdir notebooks ln -s ../$GENERATED_NOTEBOOKS_DIR notebooks/auto_examples ================================================ FILE: .binder/requirements.txt ================================================ --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn --pre matplotlib scikit-image pandas sphinx-gallery scikit-learn ================================================ FILE: .circleci/artifact_path ================================================ 0/doc/_changed.html ================================================ FILE: .circleci/config.yml ================================================ version: 2.1 jobs: doc-min-dependencies: docker: - image: circleci/python:3.7.7-buster environment: - OMP_NUM_THREADS: 2 - MKL_NUM_THREADS: 2 - CONDA_ENV_NAME: testenv - PYTHON_VERSION: 3.7 - NUMPY_VERSION: 'min' - SCIPY_VERSION: 'min' - MATPLOTLIB_VERSION: 'min' - CYTHON_VERSION: 'min' - SCIKIT_IMAGE_VERSION: 'min' - SPHINX_VERSION: 'min' - PANDAS_VERSION: 'min' - SPHINX_GALLERY_VERSION: 'min' - NUMPYDOC_VERSION: 'min' - SPHINX_PROMPT_VERSION: 'min' - SPHINXEXT_OPENGRAPH_VERSION: 'min' steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh - restore_cache: key: v1-datasets-{{ .Branch }} - restore_cache: keys: - doc-min-deps-ccache-{{ .Branch }} - doc-min-deps-ccache - run: ./build_tools/circle/build_doc.sh - save_cache: key: doc-min-deps-ccache-{{ .Branch }}-{{ .BuildNum }} paths: - ~/.ccache - ~/.cache/pip - save_cache: key: v1-datasets-{{ .Branch }} paths: - ~/scikit_learn_data - store_artifacts: path: doc/_build/html/stable destination: doc - store_artifacts: path: ~/log.txt destination: log.txt doc: docker: - image:
circleci/python:3.7.7-buster environment: - OMP_NUM_THREADS: 2 - MKL_NUM_THREADS: 2 - CONDA_ENV_NAME: testenv - PYTHON_VERSION: 3 - NUMPY_VERSION: 'latest' - SCIPY_VERSION: 'latest' - MATPLOTLIB_VERSION: 'latest' - CYTHON_VERSION: 'latest' - SCIKIT_IMAGE_VERSION: 'latest' # Bump the sphinx version from time to time. Avoid latest sphinx version # that tends to break things slightly too often - SPHINX_VERSION: 4.2.0 - PANDAS_VERSION: 'latest' - SPHINX_GALLERY_VERSION: 'latest' - NUMPYDOC_VERSION: 'latest' - SPHINX_PROMPT_VERSION: 'latest' - SPHINXEXT_OPENGRAPH_VERSION: 'latest' steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh - restore_cache: key: v1-datasets-{{ .Branch }} - restore_cache: keys: - doc-ccache-{{ .Branch }} - doc-ccache - run: ./build_tools/circle/build_doc.sh - save_cache: key: doc-ccache-{{ .Branch }}-{{ .BuildNum }} paths: - ~/.ccache - ~/.cache/pip - save_cache: key: v1-datasets-{{ .Branch }} paths: - ~/scikit_learn_data - store_artifacts: path: doc/_build/html/stable destination: doc - store_artifacts: path: ~/log.txt destination: log.txt # Persists generated documentation so that it can be attached and deployed # in the 'deploy' step. - persist_to_workspace: root: doc/_build/html paths: . lint: docker: - image: circleci/python:3.7 steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh - run: name: dependencies command: sudo pip install flake8 - run: name: linting command: ./build_tools/circle/linting.sh linux-arm64: machine: image: ubuntu-2004:202101-01 resource_class: arm.medium environment: # Use the latest supported version of python - PYTHON_VERSION: '3.9' - OMP_NUM_THREADS: 2 - OPENBLAS_NUM_THREADS: 2 - NUMPY_VERSION: 'latest' - SCIPY_VERSION: 'latest' - CYTHON_VERSION: 'latest' - JOBLIB_VERSION: 'latest' - THREADPOOLCTL_VERSION: 'latest' - PYTEST_VERSION: 'latest' - PYTEST_XDIST_VERSION: 'latest' - TEST_DOCSTRINGS: 'true' steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh - restore_cache: key: linux-arm64-{{ .Branch }} - run: ./build_tools/circle/build_test_arm.sh - save_cache: key: linux-arm64-{{ .Branch }} paths: - ~/.cache/ccache - ~/.cache/pip - ~/scikit_learn_data # The source build folder. - ~/project/build deploy: docker: - image: circleci/python:3.7 steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh # Attach documentation generated in the 'doc' step so that it can be # deployed. - attach_workspace: at: doc/_build/html - run: ls -ltrh doc/_build/html/stable - deploy: command: | if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then bash build_tools/circle/push_doc.sh doc/_build/html/stable fi workflows: version: 2 build-doc-and-deploy: jobs: - lint - doc: requires: - lint - doc-min-dependencies: requires: - lint - deploy: requires: - doc linux-arm64: jobs: - linux-arm64 ================================================ FILE: .codecov.yml ================================================ comment: false coverage: status: project: default: # Commits pushed to main should not make the overall # project coverage decrease by more than 1%: target: auto threshold: 1% patch: default: # Be tolerant of slight code coverage diffs on PRs to limit # noisy red coverage status on github PRs. # Note: The coverage stats are still uploaded # to codecov so that PR reviewers can see uncovered lines target: auto threshold: 1% codecov: notify: # Prevent coverage status from being uploaded multiple times for parallel and long # running CI pipelines.
This configuration is particularly useful on PRs # to avoid confusion. Note that this value is set to the number of Azure # Pipeline jobs uploading coverage reports. after_n_builds: 6 ignore: - "sklearn/externals" - "sklearn/_build_utils" - "**/setup.py" ================================================ FILE: .coveragerc ================================================ [run] branch = True source = sklearn parallel = True omit = */sklearn/externals/* */sklearn/_build_utils/* */benchmarks/* **/setup.py ================================================ FILE: .git-blame-ignore-revs ================================================ # Since git version 2.23, git-blame has a feature to ignore # certain commits. # # This file contains a list of commits that are not likely what # you are looking for in `git blame`. You can set this file as # a default ignore file for blame by running the following # command. # # $ git config blame.ignoreRevsFile .git-blame-ignore-revs # PR 18948: Migrate code style to Black 82df48934eba1df9a1ed3be98aaace8eada59e6e # PR 20294: Use target_version >= 3.7 in Black 351ace7935a4ea685171cc6d174890f08facd561 # PR 20412: Use experimental_string_processing=true in Black 3ae7c7615343bbd36acece57825d8b0d70fd9da4 # PR 20502: Runs Black on examples 70a185ae59b4362633d18b0d0083abb1b6f7370c ================================================ FILE: .gitattributes ================================================ /doc/whats_new/v*.rst merge=union ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] patreon: # Replace with a single Patreon username open_collective: # Replace with a single Open Collective username ko_fi: # Replace with a single Ko-fi username tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry liberapay: # Replace with a single Liberapay username issuehunt: # Replace with a single IssueHunt username otechie: # Replace with a single Otechie username custom: ['https://numfocus.org/donate-to-scikit-learn'] ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: Bug Report description: Create a report to help us reproduce and correct the bug labels: ['Bug: triage'] body: - type: markdown attributes: value: > #### Before submitting a bug, please make sure the issue hasn't already been addressed by searching through [the past issues](https://github.com/scikit-learn/scikit-learn/issues). - type: textarea attributes: label: Describe the bug description: > A clear and concise description of what the bug is. validations: required: true - type: textarea attributes: label: Steps/Code to Reproduce description: | Please add a minimal example so that we can reproduce the error by running the code. Be as succinct as possible, do not depend on external data. In short, we are going to copy-paste your code and we expect to get the same result as you.
Example: ```python from sklearn.feature_extraction.text import CountVectorizer from sklearn.decomposition import LatentDirichletAllocation docs = ["Help I have a bug" for i in range(1000)] vectorizer = CountVectorizer(input=docs, analyzer='word') lda_features = vectorizer.fit_transform(docs) lda_model = LatentDirichletAllocation( n_topics=10, learning_method='online', evaluate_every=10, n_jobs=4, ) model = lda_model.fit(lda_features) ``` If the code is too long, feel free to put it in a public gist and link it in the issue: https://gist.github.com. placeholder: | ``` Sample code to reproduce the problem ``` validations: required: true - type: textarea attributes: label: Expected Results description: > Please paste or describe the expected results. placeholder: > Example: No error is thrown. validations: required: true - type: textarea attributes: label: Actual Results description: > Please paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. placeholder: > Please paste or specifically describe the actual output or traceback. validations: required: true - type: textarea attributes: label: Versions description: | Please run the following and paste the output below. ```python import sklearn; sklearn.show_versions() ``` validations: required: true - type: markdown attributes: value: > Thanks for contributing 🎉! ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: true contact_links: - name: Discussions url: https://github.com/scikit-learn/scikit-learn/discussions/new about: Ask questions and discuss with other scikit-learn community members - name: Stack Overflow url: https://stackoverflow.com/questions/tagged/scikit-learn about: Please ask and answer usage questions on Stack Overflow - name: Mailing list url: https://mail.python.org/mailman/listinfo/scikit-learn about: General discussions and announcements on the mailing list - name: Gitter url: https://gitter.im/scikit-learn/scikit-learn about: Users and developers can sometimes be found on the gitter channel - name: Blank issue url: https://github.com/scikit-learn/scikit-learn/issues/new about: Please note that Github Discussions should be used in most cases instead ================================================ FILE: .github/ISSUE_TEMPLATE/doc_improvement.yml ================================================ name: Documentation improvement description: Create a report to help us improve the documentation. Alternatively you can just open a pull request with the suggested change. labels: [Documentation] body: - type: textarea attributes: label: Describe the issue linked to the documentation description: > Tell us about the confusion introduced in the documentation. validations: required: true - type: textarea attributes: label: Suggest a potential alternative/fix description: > Tell us how we could improve the documentation in this regard. ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.yml ================================================ name: Feature request description: Suggest a new algorithm, enhancement to an existing algorithm, etc. 
labels: ['New Feature'] body: - type: markdown attributes: value: > #### If you want to propose a new algorithm, please refer first to the [scikit-learn inclusion criterion](https://scikit-learn.org/stable/faq.html#what-are-the-inclusion-criteria-for-new-algorithms). - type: textarea attributes: label: Describe the workflow you want to enable validations: required: true - type: textarea attributes: label: Describe your proposed solution validations: required: true - type: textarea attributes: label: Describe alternatives you've considered, if relevant - type: textarea attributes: label: Additional context ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ #### Reference Issues/PRs #### What does this implement/fix? Explain your changes. #### Any other comments? ================================================ FILE: .github/labeler-file-extensions.yml ================================================ cython: - sklearn/**/*.pyx - sklearn/**/*.pxd - sklearn/**/*.pxi # Tempita templates - sklearn/**/*.pyx.tp - sklearn/**/*.pxd.tp - sklearn/**/*.pxi.tp ================================================ FILE: .github/labeler-module.yml ================================================ module:cluster: - sklearn/cluster/**/* module:common: - sklearn/common/**/* module:compose: - sklearn/compose/**/* module:covariance: - sklearn/covariance/**/* module:cross_decomposition: - sklearn/cross_decomposition/**/* module:datasets: - sklearn/datasets/**/* module:decomposition: - sklearn/decomposition/**/* module:ensemble: - sklearn/ensemble/**/* module:feature_extraction: - sklearn/feature_extraction/**/* module:feature_selection: - sklearn/feature_selection/**/* module:gaussian_process: - sklearn/gaussian_process/**/* module:impute: - sklearn/impute/**/* module:inspection: - sklearn/inspection/**/* module:linear_model: - sklearn/linear_model/**/* module:manifold: - sklearn/manifold/**/* module:metrics: - sklearn/metrics/**/* module:mixture: - sklearn/mixture/**/* module:model_selection: - sklearn/model_selection/**/* module:naive_bayes: - sklearn/naive_bayes.py module:neighbors: - sklearn/neighbors/**/* module:neural_network: - sklearn/neural_network/**/* module:pipeline: - sklearn/pipeline.py module:preprocessing: - sklearn/preprocessing/**/* module:semi_supervised: - sklearn/semi_supervised/**/* module:svm: - sklearn/svm/**/* module:tree: - sklearn/tree/**/* module:utils: - sklearn/utils/**/* ================================================ FILE: .github/scripts/label_title_regex.py ================================================ """Labels PRs based on title. 
Must be run in a github action with the pull_request_target event.""" from github import Github import os import json import re context_dict = json.loads(os.getenv("CONTEXT_GITHUB")) repo = context_dict["repository"] g = Github(context_dict["token"]) repo = g.get_repo(repo) pr_number = context_dict["event"]["number"] issue = repo.get_issue(number=pr_number) title = issue.title regex_to_labels = [(r"\bDOC\b", "Documentation"), (r"\bCI\b", "Build / CI")] labels_to_add = [label for regex, label in regex_to_labels if re.search(regex, title)] if labels_to_add: issue.add_to_labels(*labels_to_add) ================================================ FILE: .github/workflows/assign.yml ================================================ name: Assign on: issue_comment: types: created jobs: one: runs-on: ubuntu-latest if: >- (github.event.comment.body == 'take' || github.event.comment.body == 'Take') && !github.event.issue.assignee steps: - run: | echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -X "DELETE" https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels/help%20wanted ================================================ FILE: .github/workflows/check-changelog.yml ================================================ name: Check Changelog # This check makes sure that the changelog is properly updated # when a PR introduces a change in a test file. # To bypass this check, label the PR with "No Changelog Needed". on: pull_request: types: [opened, edited, labeled, unlabeled, synchronize] jobs: check: runs-on: ubuntu-latest if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} steps: - name: Get PR number and milestone run: | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV - uses: actions/checkout@v2 with: fetch-depth: '0' - name: Check the changelog run: | set -xe changed_files=$(git diff --name-only origin/main) # Changelog should be updated only if tests have been modified if [[ ! "$changed_files" =~ tests ]] then exit 0 fi all_changelogs=$(cat ./doc/whats_new/v*.rst) if [[ "$all_changelogs" =~ :pr:\`$PR_NUMBER\` ]] then echo "Changelog has been updated." # If the pull request is milestoned check the corresponding changelog if [ -f ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst ] then expected_changelog=$(cat ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst) if [[ "$expected_changelog" =~ :pr:\`$PR_NUMBER\` ]] then echo "Changelog and milestone correspond." else echo "Changelog and milestone do not correspond." echo "If you see this error make sure that the tagged milestone for the PR" echo "and the edited changelog filename properly match." exit 1 fi fi else echo "A Changelog entry is missing." echo "" echo "Please add an entry to the changelog at 'doc/whats_new/v*.rst'" echo "to document your change assuming that the PR will be merged" echo "in time for the next release of scikit-learn."
echo "" echo "Look at other entries in that file for inspiration and please" echo "reference this pull request using the ':pr:' directive and" echo "credit yourself (and other contributors if applicable) with" echo "the ':user:' directive." echo "" echo "If you see this error and there is already a changelog entry," echo "check that the PR number is correct." echo "" echo" If you believe that this PR does no warrant a changelog" echo "entry, say so in a comment so that a maintainer will label " echo "the PR with 'No Changelog Needed' to bypass this check." exit 1 fi ================================================ FILE: .github/workflows/check-manifest.yml ================================================ name: "Check Manifest" on: schedule: - cron: '0 0 * * *' jobs: check: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: '3.9' - name: Install dependencies # scipy and cython are required to build sdist run: | python -m pip install --upgrade pip pip install check-manifest scipy cython - run: | check-manifest -v ================================================ FILE: .github/workflows/labeler-module.yml ================================================ name: "Pull Request Labeler" on: pull_request_target jobs: triage: runs-on: ubuntu-latest steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true if: github.repository == 'scikit-learn/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" configuration-path: ".github/labeler-module.yml" triage_file_extensions: runs-on: ubuntu-latest steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true if: github.repository == 'scikit-learn/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: ".github/labeler-file-extensions.yml" ================================================ FILE: .github/workflows/labeler-title-regex.yml ================================================ name: Pull Request Regex Title Labeler on: pull_request_target: types: [opened, edited] permissions: contents: read pull-requests: write jobs: labeler: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: '3.9' - name: Install PyGithub run: pip install -Uq PyGithub - name: Label pull request run: python .github/scripts/label_title_regex.py env: CONTEXT_GITHUB: ${{ toJson(github) }} ================================================ FILE: .github/workflows/publish_pypi.yml ================================================ name: Publish to Pypi on: workflow_dispatch: inputs: version: description: 'Version upload to pypi' required: true pypi_repo: description: 'Repo to upload to (testpypi or pypi)' default: 'testpypi' required: true jobs: publish: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: '3.8' - name: Install dependencies run: | pip install -U wheelhouse_uploader pyyaml - name: Downloading wheels and sdist from staging env: SKLEARN_VERSION: ${{ github.event.inputs.version }} run: | echo "Download $SKLEARN_VERSION wheels and sdist" python -m wheelhouse_uploader fetch \ --version $SKLEARN_VERSION \ --local-folder dist/ \ scikit-learn \ https://pypi.anaconda.org/scikit-learn-wheels-staging/simple/scikit-learn/ - name: Check dist has the correct number of artifacts run: | python build_tools/github/check_wheels.py - name: Publish package to TestPyPI uses: pypa/gh-action-pypi-publish@v1.4.1 with: user: __token__ password: ${{ 
secrets.TEST_PYPI_TOKEN }} repository_url: https://test.pypi.org/legacy/ if: ${{ github.event.inputs.pypi_repo == 'testpypi' }} - name: Publish package to PyPI uses: pypa/gh-action-pypi-publish@v1.4.1 with: user: __token__ password: ${{ secrets.PYPI_TOKEN }} if: ${{ github.event.inputs.pypi_repo == 'pypi' }} ================================================ FILE: .github/workflows/twitter.yml ================================================ # Tweet the URL of a commit on @sklearn_commits whenever a push event # happens on the main branch name: Twitter Push Notification on: push: branches: - main jobs: tweet: name: Twitter Notification runs-on: ubuntu-latest steps: - name: Tweet URL of last commit as @sklearn_commits if: github.repository == 'scikit-learn/scikit-learn' uses: docker://thomasjpfan/twitter-action:0.3 with: args: "-message \"https://github.com/scikit-learn/scikit-learn/commit/${{ github.sha }}\"" env: TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} TWITTER_ACCESS_SECRET: ${{ secrets.TWITTER_ACCESS_SECRET }} ================================================ FILE: .github/workflows/unassign.yml ================================================ name: Unassign #Runs when a contributor has unassigned themselves from the issue and adds 'help wanted' on: issues: types: unassigned jobs: one: runs-on: ubuntu-latest steps: - name: if: github.event.issue.state == 'open' run: | echo "Marking issue ${{ github.event.issue.number }} as help wanted" curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"labels": ["help wanted"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels ================================================ FILE: .github/workflows/wheels.yml ================================================ # Workflow to build and test wheels name: Wheel builder on: schedule: # Nightly build at 3:42 A.M. 
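    # (cron fields are minute, hour, day-of-month, month, day-of-week,
    # so the "42 3 */1 * *" schedule below fires every day at 03:42 UTC)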
- cron: "42 3 */1 * *" push: branches: - main # Release branches - "[0-9]+.[0-9]+.X" pull_request: branches: - main - "[0-9]+.[0-9]+.X" # Manual run workflow_dispatch: jobs: # Check whether to build the wheels and the source tarball check_build_trigger: name: Check build trigger runs-on: ubuntu-latest if: github.repository == 'scikit-learn/scikit-learn' outputs: build: ${{ steps.check_build_trigger.outputs.build }} steps: - name: Checkout scikit-learn uses: actions/checkout@v2 with: ref: ${{ github.event.pull_request.head.sha }} - id: check_build_trigger name: Check build trigger run: bash build_tools/github/check_build_trigger.sh # Build the wheels for Linux, Windows and macOS for Python 3.7 and newer build_wheels: name: Build wheel for cp${{ matrix.python }}-${{ matrix.platform_id }}-${{ matrix.manylinux_image }} runs-on: ${{ matrix.os }} needs: check_build_trigger if: needs.check_build_trigger.outputs.build strategy: # Ensure that a wheel builder finishes even if another fails fail-fast: false matrix: os: [windows-latest, ubuntu-latest, macos-latest] python: [37, 38, 39] bitness: [32, 64] manylinux_image: [manylinux1, manylinux2010] include: # Run 32 and 64 bit version in parallel for Linux and Windows - os: windows-latest bitness: 64 platform_id: win_amd64 - os: windows-latest bitness: 32 platform_id: win32 - os: ubuntu-latest bitness: 64 platform_id: manylinux_x86_64 - os: ubuntu-latest bitness: 32 platform_id: manylinux_i686 - os: macos-latest bitness: 64 platform_id: macosx_x86_64 exclude: - os: macos-latest bitness: 32 # Remove manylinux1 from the windows and osx build matrix since # manylinux_image is not used for these platforms - os: windows-latest manylinux_image: manylinux1 - os: macos-latest manylinux_image: manylinux1 steps: - name: Checkout scikit-learn uses: actions/checkout@v1 - name: Setup Python uses: actions/setup-python@v2 with: python-version: '3.9' # update once build dependencies are available - name: Build and test wheels env: CONFTEST_PATH: ${{ github.workspace }}/conftest.py CONFTEST_NAME: conftest.py CIBW_ENVIRONMENT: OMP_NUM_THREADS=2 OPENBLAS_NUM_THREADS=2 SKLEARN_SKIP_NETWORK_TESTS=1 SKLEARN_BUILD_PARALLEL=3 MACOSX_DEPLOYMENT_TARGET=10.13 CIBW_BUILD: cp${{ matrix.python }}-${{ matrix.platform_id }} CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.manylinux_image }} CIBW_MANYLINUX_I686_IMAGE: ${{ matrix.manylinux_image }} CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: bash build_tools/github/repair_windows_wheels.sh {wheel} {dest_dir} ${{ matrix.bitness }} CIBW_BEFORE_TEST_WINDOWS: bash build_tools/github/build_minimal_windows_image.sh ${{ matrix.python }} ${{ matrix.bitness }} CIBW_TEST_REQUIRES: pytest pandas threadpoolctl CIBW_TEST_COMMAND: bash {project}/build_tools/github/test_wheels.sh CIBW_TEST_COMMAND_WINDOWS: bash {project}/build_tools/github/test_windows_wheels.sh ${{ matrix.python }} ${{ matrix.bitness }} CIBW_BUILD_VERBOSITY: 1 run: bash build_tools/github/build_wheels.sh - name: Store artifacts uses: actions/upload-artifact@v2 with: path: wheelhouse/*.whl # Build the source distribution under Linux build_sdist: name: Source distribution runs-on: ubuntu-latest needs: check_build_trigger if: needs.check_build_trigger.outputs.build steps: - name: Checkout scikit-learn uses: actions/checkout@v1 - name: Setup Python uses: actions/setup-python@v2 with: python-version: '3.9' # update once build dependencies are available - name: Build source distribution run: bash build_tools/github/build_source.sh env: SKLEARN_BUILD_PARALLEL: 3 - name: Test source distribution run: bash 
build_tools/github/test_source.sh env: OMP_NUM_THREADS: 2 OPENBLAS_NUM_THREADS: 2 SKLEARN_SKIP_NETWORK_TESTS: 1 - name: Store artifacts uses: actions/upload-artifact@v2 with: path: dist/*.tar.gz # Upload the wheels and the source distribution upload_anaconda: name: Upload to Anaconda runs-on: ubuntu-latest needs: [build_wheels, build_sdist] # The artifacts cannot be uploaded on PRs if: github.event_name != 'pull_request' steps: - name: Checkout scikit-learn uses: actions/checkout@v1 - name: Download artifacts uses: actions/download-artifact@v2 with: path: dist - name: Setup Python uses: actions/setup-python@v2 - name: Upload artifacts env: # Secret variables need to be mapped to environment variables explicitly SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN }} SCIKIT_LEARN_STAGING_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_STAGING_UPLOAD_TOKEN }} # Force a replacement if the remote file already exists run: bash build_tools/github/upload_anaconda.sh ================================================ FILE: .gitignore ================================================ *.pyc *.so *.pyd *~ .#* *.lprof *.swp *.swo .DS_Store build sklearn/datasets/__config__.py sklearn/**/*.html dist/ MANIFEST doc/_build/ doc/auto_examples/ doc/modules/generated/ doc/datasets/generated/ doc/min_dependency_table.rst doc/min_dependency_substitutions.rst *.pdf pip-log.txt scikit_learn.egg-info/ .coverage coverage *.py,cover .tags* tags covtype.data.gz 20news-18828/ 20news-18828.tar.gz coverages.zip samples.zip doc/coverages.zip doc/samples.zip coverages samples doc/coverages doc/samples *.prof .tox/ .coverage pip-wheel-metadata lfw_preprocessed/ nips2010_pdf/ *.nt.bz2 *.tar.gz *.tgz examples/cluster/joblib reuters/ benchmarks/bench_covertype_data/ *.prefs .pydevproject .idea .vscode *.c *.cpp !/**/src/**/*.c !/**/src/**/*.cpp *.sln *.pyproj # Used by py.test .cache .pytest_cache/ _configtest.o.d # Used by mypy .mypy_cache/ # files generated from a template sklearn/utils/_seq_dataset.pyx sklearn/utils/_seq_dataset.pxd sklearn/utils/_weight_vector.pyx sklearn/utils/_weight_vector.pxd sklearn/linear_model/_sag_fast.pyx ================================================ FILE: .mailmap ================================================ Alexandre Gramfort Alexandre Gramfort Alexandre Gramfort Alexandre Saint Andreas Mueller Andreas Mueller Andreas Mueller Andreas Mueller Andreas Mueller Andreas Mueller Arnaud Joly Arnaud Joly Arnaud Joly Anne-Laure Fouque Ariel Rokem arokem Bala Subrahmanyam Varanasi Bertrand Thirion Brandyn A. 
White Brian Cheung Brian Cheung Brian Cheung Brian Holt Christian Osendorfer Clay Woolam Danny Sullivan Denis Engemann Denis Engemann Denis Engemann Denis Engemann dengemann Diego Molla DraXus draxus Edouard DUCHESNAY Edouard DUCHESNAY Edouard DUCHESNAY Emmanuelle Gouillart Emmanuelle Gouillart Eustache Diemert Fabian Pedregosa Fabian Pedregosa Fabian Pedregosa Federico Vaggi Federico Vaggi Gael Varoquaux Gael Varoquaux Gael Varoquaux Giorgio Patrini Giorgio Patrini Gilles Louppe Hamzeh Alsalhi <93hamsal@gmail.com> Harikrishnan S Hendrik Heuer Henry Lin Hrishikesh Huilgolkar Hugo Bowne-Anderson Imaculate Immanuel Bayer Jacob Schreiber Jacob Schreiber Jake VanderPlas Jake VanderPlas Jake VanderPlas James Bergstra Jaques Grobler Jan Schlüter Jean Kossaifi Jean Kossaifi Jean Kossaifi Joel Nothman Kyle Kastner Lars Buitinck Lars Buitinck Lars Buitinck Lars Buitinck Lars Buitinck Loic Esteve Manoj Kumar Matthieu Perrot Maheshakya Wijewardena Michael Bommarito Michael Eickenberg Michael Eickenberg Samuel Charron Sergio Medina Nelle Varoquaux Nelle Varoquaux Nelle Varoquaux Nicolas Goix Nicolas Pinto Noel Dawe Noel Dawe Olivier Grisel Olivier Grisel Olivier Hervieu Paul Butler Peter Prettenhofer Raghav RV Raghav RV Robert Layton Roman Sinayev Roman Sinayev Ronald Phlypo Satrajit Ghosh Sebastian Raschka Sebastian Raschka Shiqiao Du Shiqiao Du Thomas Unterthiner Tim Sheerman-Chase Vincent Dubourg Vincent Dubourg Vincent Michel Vincent Michel Vincent Michel Vincent Michel Vincent Michel Vincent Schut Virgile Fritsch Virgile Fritsch Vlad Niculae Wei Li Wei Li X006 Xinfan Meng Yannick Schwartz Yannick Schwartz Yannick Schwartz ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.3.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/psf/black rev: 21.6b0 hooks: - id: black - repo: https://gitlab.com/pycqa/flake8 rev: 3.9.2 hooks: - id: flake8 types: [file, python] - repo: https://github.com/pre-commit/mirrors-mypy rev: v0.782 hooks: - id: mypy files: sklearn/ additional_dependencies: [pytest==6.2.4] ================================================ FILE: .travis.yml ================================================ # Make it explicit that we favor the # new container-based Travis workers language: python dist: xenial cache: apt: true directories: - $HOME/.cache/pip - $HOME/.ccache env: global: - CPU_COUNT=3 - TEST_DIR=/tmp/sklearn # Test directory for continuous integration jobs - PYTEST_VERSION=latest - OMP_NUM_THREADS=2 - OPENBLAS_NUM_THREADS=2 - SKLEARN_BUILD_PARALLEL=3 - SKLEARN_SKIP_NETWORK_TESTS=1 - PYTHONUNBUFFERED=1 # Custom environment variables for the ARM wheel builder - CIBW_BUILD_VERBOSITY=1 - CIBW_TEST_COMMAND="bash {project}/build_tools/travis/test_wheels.sh" - CIBW_ENVIRONMENT="CPU_COUNT=2 OMP_NUM_THREADS=2 OPENBLAS_NUM_THREADS=2 SKLEARN_BUILD_PARALLEL=10 SKLEARN_SKIP_NETWORK_TESTS=1 PYTHONUNBUFFERED=1" jobs: include: # Linux environments to build the scikit-learn wheels for the ARM64 # architecture and Python 3.7 and newer. This is used both at release time # with the manual trigger in the commit message in the release branch and as # a scheduled task to build the weekly dev build on the main branch. The # weekly frequency is meant to avoid depleting the Travis CI credits too # fast. 
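    # Each matrix entry below builds and tests one CPython aarch64 wheel with
    # cibuildwheel; the "if:" condition limits these jobs to cron runs or to
    # commits whose message contains the "[cd build]" marker.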
- python: 3.7 os: linux arch: arm64-graviton2 dist: focal virt: lxd group: edge if: type = cron or commit_message =~ /\[cd build\]/ env: - BUILD_WHEEL=true - CIBW_BUILD=cp37-manylinux_aarch64 - python: 3.8 os: linux arch: arm64-graviton2 dist: focal virt: lxd group: edge if: type = cron or commit_message =~ /\[cd build\]/ env: - BUILD_WHEEL=true - CIBW_BUILD=cp38-manylinux_aarch64 - python: 3.9 os: linux arch: arm64-graviton2 dist: focal virt: lxd group: edge if: type = cron or commit_message =~ /\[cd build\]/ env: - BUILD_WHEEL=true - CIBW_BUILD=cp39-manylinux_aarch64 install: source build_tools/travis/install.sh || travis_terminate 1 script: source build_tools/travis/script.sh || travis_terminate 1 after_success: source build_tools/travis/after_success.sh || travis_terminate 1 notifications: webhooks: urls: - https://webhooks.gitter.im/e/4ffabb4df010b70cd624 on_success: change on_failure: always on_start: never ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Code of Conduct We are a community based on openness, as well as friendly and didactic discussions. We aspire to treat everybody equally, and value their contributions. Decisions are made based on technical merit and consensus. Code is not the only way to help the project. Reviewing pull requests, answering questions to help others on mailing lists or issues, organizing and teaching tutorials, working on the website, improving the documentation, are all priceless contributions. We abide by the principles of openness, respect, and consideration of others of the Python Software Foundation: https://www.python.org/psf/codeofconduct/ ================================================ FILE: CONTRIBUTING.md ================================================ Contributing to scikit-learn ============================ The latest contributing guide is available in the repository at `doc/developers/contributing.rst`, or online at: https://scikit-learn.org/dev/developers/contributing.html There are many ways to contribute to scikit-learn, with the most common ones being contribution of code or documentation to the project. Improving the documentation is no less important than improving the library itself. If you find a typo in the documentation, or have made improvements, do not hesitate to send an email to the mailing list or preferably submit a GitHub pull request. Documentation can be found under the [doc/](https://github.com/scikit-learn/scikit-learn/tree/main/doc) directory. But there are many other ways to help. In particular answering queries on the [issue tracker](https://github.com/scikit-learn/scikit-learn/issues), investigating bugs, and [reviewing other developers' pull requests](http://scikit-learn.org/dev/developers/contributing.html#code-review-guidelines) are very valuable contributions that decrease the burden on the project maintainers. Another way to contribute is to report issues you're facing, and give a "thumbs up" on issues that others reported and that are relevant to you. It also helps us if you spread the word: reference the project from your blog and articles, link to it from your website, or simply star it in GitHub to say "I use it". 
Quick links ----------- * [Submitting a bug report or feature request](http://scikit-learn.org/dev/developers/contributing.html#submitting-a-bug-report-or-a-feature-request) * [Contributing code](http://scikit-learn.org/dev/developers/contributing.html#contributing-code) * [Coding guidelines](https://scikit-learn.org/dev/developers/develop.html#coding-guidelines) * [Tips to read current code](https://scikit-learn.org/dev/developers/contributing.html#reading-the-existing-code-base) Code of Conduct --------------- We abide by the principles of openness, respect, and consideration of others of the Python Software Foundation: https://www.python.org/psf/codeofconduct/. ================================================ FILE: COPYING ================================================ BSD 3-Clause License Copyright (c) 2007-2021 The scikit-learn developers. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: MANIFEST.in ================================================ include *.rst recursive-include doc * recursive-include examples * recursive-include sklearn *.c *.h *.pyx *.pxd *.pxi *.tp recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt *.arff.gz *.json.gz include COPYING include README.rst include pyproject.toml include sklearn/externals/README include sklearn/svm/src/liblinear/COPYRIGHT include sklearn/svm/src/libsvm/LIBSVM_CHANGES include conftest.py include Makefile include MANIFEST.in include .coveragerc # exclude from sdist recursive-exclude asv_benchmarks * recursive-exclude benchmarks * recursive-exclude build_tools * recursive-exclude maint_tools * recursive-exclude benchmarks * recursive-exclude .binder * recursive-exclude .circleci * exclude .codecov.yml exclude .git-blame-ignore-revs exclude .mailmap exclude .pre-commit-config.yaml exclude azure-pipelines.yml exclude lgtm.yml exclude CODE_OF_CONDUCT.md exclude CONTRIBUTING.md exclude PULL_REQUEST_TEMPLATE.md ================================================ FILE: Makefile ================================================ # simple makefile to simplify repetitive build env management tasks under posix # caution: testing won't work on windows, see README PYTHON ?= python CYTHON ?= cython PYTEST ?= pytest CTAGS ?= ctags # skip doctests on 32bit python BITS := $(shell python -c 'import struct; print(8 * struct.calcsize("P"))') all: clean inplace test clean-ctags: rm -f tags clean: clean-ctags $(PYTHON) setup.py clean rm -rf dist in: inplace # just a shortcut inplace: $(PYTHON) setup.py build_ext -i test-code: in $(PYTEST) --showlocals -v sklearn --durations=20 test-sphinxext: $(PYTEST) --showlocals -v doc/sphinxext/ test-doc: ifeq ($(BITS),64) $(PYTEST) $(shell find doc -name '*.rst' | sort) endif test-code-parallel: in $(PYTEST) -n auto --showlocals -v sklearn --durations=20 test-coverage: rm -rf coverage .coverage $(PYTEST) sklearn --showlocals -v --cov=sklearn --cov-report=html:coverage test-coverage-parallel: rm -rf coverage .coverage .coverage.* $(PYTEST) sklearn -n auto --showlocals -v --cov=sklearn --cov-report=html:coverage test: test-code test-sphinxext test-doc trailing-spaces: find sklearn -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \; cython: python setup.py build_src ctags: # make tags for symbol based navigation in emacs and vim # Install with: sudo apt-get install exuberant-ctags $(CTAGS) --python-kinds=-i -R sklearn doc: inplace $(MAKE) -C doc html doc-noplot: inplace $(MAKE) -C doc html-noplot code-analysis: flake8 sklearn | grep -v __init__ | grep -v external pylint -E -i y sklearn/ -d E1103,E0611,E1101 flake8-diff: git diff upstream/main -u -- "*.py" | flake8 --diff ================================================ FILE: README.rst ================================================ .. -*- mode: rst -*- |Azure|_ |Travis|_ |Codecov|_ |CircleCI|_ |Nightly wheels|_ |Black|_ |PythonVersion|_ |PyPi|_ |DOI|_ .. |Azure| image:: https://dev.azure.com/scikit-learn/scikit-learn/_apis/build/status/scikit-learn.scikit-learn?branchName=main .. _Azure: https://dev.azure.com/scikit-learn/scikit-learn/_build/latest?definitionId=1&branchName=main .. |CircleCI| image:: https://circleci.com/gh/scikit-learn/scikit-learn/tree/main.svg?style=shield&circle-token=:circle-token .. _CircleCI: https://circleci.com/gh/scikit-learn/scikit-learn .. |Travis| image:: https://api.travis-ci.com/scikit-learn/scikit-learn.svg?branch=main .. 
_Travis: https://app.travis-ci.com/github/scikit-learn/scikit-learn

.. |Codecov| image:: https://codecov.io/gh/scikit-learn/scikit-learn/branch/main/graph/badge.svg?token=Pk8G9gg3y9
.. _Codecov: https://codecov.io/gh/scikit-learn/scikit-learn

.. |Nightly wheels| image:: https://github.com/scikit-learn/scikit-learn/workflows/Wheel%20builder/badge.svg?event=schedule
.. _`Nightly wheels`: https://github.com/scikit-learn/scikit-learn/actions?query=workflow%3A%22Wheel+builder%22+event%3Aschedule

.. |PythonVersion| image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue
.. _PythonVersion: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue

.. |PyPi| image:: https://img.shields.io/pypi/v/scikit-learn
.. _PyPi: https://pypi.org/project/scikit-learn

.. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
.. _Black: https://github.com/psf/black

.. |DOI| image:: https://zenodo.org/badge/21369/scikit-learn/scikit-learn.svg
.. _DOI: https://zenodo.org/badge/latestdoi/21369/scikit-learn/scikit-learn

.. |PythonMinVersion| replace:: 3.7
.. |NumPyMinVersion| replace:: 1.14.6
.. |SciPyMinVersion| replace:: 1.1.0
.. |JoblibMinVersion| replace:: 0.11
.. |ThreadpoolctlMinVersion| replace:: 2.0.0
.. |MatplotlibMinVersion| replace:: 2.2.3
.. |Scikit-ImageMinVersion| replace:: 0.14.5
.. |PandasMinVersion| replace:: 0.25.0
.. |SeabornMinVersion| replace:: 0.9.0
.. |PytestMinVersion| replace:: 5.0.1

.. image:: https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/doc/logos/scikit-learn-logo.png
  :target: https://scikit-learn.org/

**scikit-learn** is a Python module for machine learning built on top of
SciPy and is distributed under the 3-Clause BSD license.

The project was started in 2007 by David Cournapeau as a Google Summer
of Code project, and since then many volunteers have contributed. See
the `About us `__ page for a list of core contributors.

It is currently maintained by a team of volunteers.

Website: https://scikit-learn.org

Installation
------------

Dependencies
~~~~~~~~~~~~

scikit-learn requires:

- Python (>= |PythonMinVersion|)
- NumPy (>= |NumPyMinVersion|)
- SciPy (>= |SciPyMinVersion|)
- joblib (>= |JoblibMinVersion|)
- threadpoolctl (>= |ThreadpoolctlMinVersion|)

**Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.**
scikit-learn 0.23 and later require Python 3.6 or newer.
scikit-learn 1.0 and later require Python 3.7 or newer.

Scikit-learn plotting capabilities (i.e., functions starting with ``plot_``
and classes ending with "Display") require Matplotlib (>= |MatplotlibMinVersion|).
For running the examples Matplotlib >= |MatplotlibMinVersion| is required.
A few examples require scikit-image >= |Scikit-ImageMinVersion|, a few examples
require pandas >= |PandasMinVersion|, some examples require seaborn >=
|SeabornMinVersion|.

User installation
~~~~~~~~~~~~~~~~~

If you already have a working installation of numpy and scipy, the easiest
way to install scikit-learn is using ``pip``::

    pip install -U scikit-learn

or ``conda``::

    conda install -c conda-forge scikit-learn

The documentation includes more detailed `installation instructions `_.

Changelog
---------

See the `changelog `__
for a history of notable changes to scikit-learn.

Development
-----------

We welcome new contributors of all experience levels. The scikit-learn
community goals are to be helpful, welcoming, and effective. The
`Development Guide `_
has detailed information about contributing code, documentation, tests, and more.
We've included some basic information in this README.

Important links
~~~~~~~~~~~~~~~

- Official source code repo: https://github.com/scikit-learn/scikit-learn
- Download releases: https://pypi.org/project/scikit-learn/
- Issue tracker: https://github.com/scikit-learn/scikit-learn/issues

Source code
~~~~~~~~~~~

You can check the latest sources with the command::

    git clone https://github.com/scikit-learn/scikit-learn.git

Contributing
~~~~~~~~~~~~

To learn more about making a contribution to scikit-learn, please see our
`Contributing guide `_.

Testing
~~~~~~~

After installation, you can launch the test suite from outside the source
directory (you will need to have ``pytest`` >= |PytestMinVersion| installed)::

    pytest sklearn

See the web page https://scikit-learn.org/dev/developers/advanced_installation.html#testing
for more information.

Random number generation can be controlled during testing by setting
the ``SKLEARN_SEED`` environment variable.

Submitting a Pull Request
~~~~~~~~~~~~~~~~~~~~~~~~~

Before opening a Pull Request, have a look at the full Contributing page to
make sure your code complies with our guidelines:
https://scikit-learn.org/stable/developers/index.html

Project History
---------------

The project was started in 2007 by David Cournapeau as a Google Summer
of Code project, and since then many volunteers have contributed. See
the `About us `__ page for a list of core contributors.

The project is currently maintained by a team of volunteers.

**Note**: `scikit-learn` was previously referred to as `scikits.learn`.

Help and Support
----------------

Documentation
~~~~~~~~~~~~~

- HTML documentation (stable release): https://scikit-learn.org
- HTML documentation (development version): https://scikit-learn.org/dev/
- FAQ: https://scikit-learn.org/stable/faq.html

Communication
~~~~~~~~~~~~~

- Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn
- Gitter: https://gitter.im/scikit-learn/scikit-learn
- Twitter: https://twitter.com/scikit_learn
- Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn
- GitHub Discussions: https://github.com/scikit-learn/scikit-learn/discussions
- Website: https://scikit-learn.org
- LinkedIn: https://www.linkedin.com/company/scikit-learn

Citation
~~~~~~~~

If you use scikit-learn in a scientific publication, we would appreciate
citations: https://scikit-learn.org/stable/about.html#citing-scikit-learn

================================================
FILE: SECURITY.md
================================================
# Security Policy

## Supported Versions

| Version | Supported          |
| ------- | ------------------ |
| 1.0.1   | :white_check_mark: |
| < 1.0.1 | :x:                |

## Reporting a Vulnerability

Please report security vulnerabilities by email to `security@scikit-learn.org`.
This email is an alias to a subset of the scikit-learn maintainers' team.

If the security vulnerability is accepted, a patch will be crafted privately
in order to prepare a dedicated bugfix release as timely as possible
(depending on the complexity of the fix).

================================================
FILE: asv_benchmarks/.gitignore
================================================
*__pycache__*
env/
html/
results/
scikit-learn/
benchmarks/cache/

================================================
FILE: asv_benchmarks/asv.conf.json
================================================
{
    // The version of the config file format. Do not change, unless
    // you know what you are doing.
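    // (asv reads this file with a comment-tolerant JSON parser, which is why
    // the "//" lines throughout are legal even though plain JSON has no
    // comments)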
"version": 1, // The name of the project being benchmarked "project": "scikit-learn", // The project's homepage "project_url": "scikit-learn.org/", // The URL or local path of the source code repository for the // project being benchmarked "repo": "..", // The Python project's subdirectory in your repo. If missing or // the empty string, the project is assumed to be located at the root // of the repository. // "repo_subdir": "", // Customizable commands for building, installing, and // uninstalling the project. See asv.conf.json documentation. // // "install_command": ["python -mpip install {wheel_file}"], // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], // "build_command": [ // "python setup.py build", // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" // ], // List of branches to benchmark. If not provided, defaults to "master // (for git) or "default" (for mercurial). "branches": ["main"], // "branches": ["default"], // for mercurial // The DVCS being used. If not set, it will be automatically // determined from "repo" by looking at the protocol in the URL // (if remote), or by looking for special directories, such as // ".git" (if local). // "dvcs": "git", // The tool to use to create environments. May be "conda", // "virtualenv" or other value depending on the plugins in use. // If missing or the empty string, the tool will be automatically // determined by looking for tools on the PATH environment // variable. "environment_type": "conda", // timeout in seconds for installing any dependencies in environment // defaults to 10 min //"install_timeout": 600, // the base URL to show a commit for the project. "show_commit_url": "https://github.com/scikit-learn/scikit-learn/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. // "pythons": ["3.6"], // The list of conda channel names to be searched for benchmark // dependency packages in the specified order // "conda_channels": ["conda-forge", "defaults"] // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty // list or empty string indicates to just test against the default // (latest) version. null indicates that the package is to not be // installed. If the package to be tested is only available from // PyPi, and the 'environment_type' is conda, then you can preface // the package name by 'pip+', and the package will be installed via // pip (with all the conda available packages installed first, // followed by the pip installed packages). // "matrix": { "numpy": [], "scipy": [], "cython": [], "joblib": [], "threadpoolctl": [] }, // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. // // An exclude entry excludes entries where all values match. The // values are regexps that should match the whole string. // // An include entry adds an environment. Only the packages listed // are installed. The 'python' key is required. The exclude rules // do not apply to includes. // // In addition to package names, the following keys are available: // // - python // Python version, as in the *pythons* variable above. // - environment_type // Environment type, as above. // - sys_platform // Platform, as in sys.platform. 
Possible values for the common // cases: 'linux2', 'win32', 'cygwin', 'darwin'. // // "exclude": [ // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows // {"environment_type": "conda", "six": null}, // don't run without six on conda // ], // // "include": [ // // additional env for python2.7 // {"python": "2.7", "numpy": "1.8"}, // // additional env if run on windows+conda // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, // ], // The directory (relative to the current directory) that benchmarks are // stored in. If not provided, defaults to "benchmarks" // "benchmark_dir": "benchmarks", // The directory (relative to the current directory) to cache the Python // environments in. If not provided, defaults to "env" // "env_dir": "env", // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". // "results_dir": "results", // The directory (relative to the current directory) that the html tree // should be written to. If not provided, defaults to "html". // "html_dir": "html", // The number of characters to retain in the commit hashes. // "hash_length": 8, // `asv` will cache results of the recent builds in each // environment, making them faster to install next time. This is // the number of builds to keep, per environment. // "build_cache_size": 2, // The commits after which the regression search in `asv publish` // should start looking for regressions. Dictionary whose keys are // regexps matching to benchmark names, and values corresponding to // the commit (exclusive) after which to start looking for // regressions. The default is to start from the first commit // with results. If the commit is `null`, regression detection is // skipped for the matching benchmark. // // "regressions_first_commits": { // "some_benchmark": "352cdf", // Consider regressions only after this commit // "another_benchmark": null, // Skip regression detection altogether // }, // The thresholds for relative change in results, after which `asv // publish` starts reporting regressions. Dictionary of the same // form as in ``regressions_first_commits``, with values // indicating the thresholds. If multiple entries match, the // maximum is taken. If no entry matches, the default is 5%. // // "regressions_thresholds": { // "some_benchmark": 0.01, // Threshold of 1% // "another_benchmark": 0.5, // Threshold of 50% // }, } ================================================ FILE: asv_benchmarks/benchmarks/__init__.py ================================================ """Benchmark suite for scikit-learn using ASV""" ================================================ FILE: asv_benchmarks/benchmarks/cluster.py ================================================ from sklearn.cluster import KMeans, MiniBatchKMeans from .common import Benchmark, Estimator, Predictor, Transformer from .datasets import _blobs_dataset, _20newsgroups_highdim_dataset from .utils import neg_mean_inertia class KMeansBenchmark(Predictor, Transformer, Estimator, Benchmark): """ Benchmarks for KMeans. 
""" param_names = ["representation", "algorithm", "init"] params = (["dense", "sparse"], ["full", "elkan"], ["random", "k-means++"]) def setup_cache(self): super().setup_cache() def make_data(self, params): representation, algorithm, init = params if representation == "sparse": data = _20newsgroups_highdim_dataset(n_samples=8000) else: data = _blobs_dataset(n_clusters=20) return data def make_estimator(self, params): representation, algorithm, init = params max_iter = 30 if representation == "sparse" else 100 estimator = KMeans( n_clusters=20, algorithm=algorithm, init=init, n_init=1, max_iter=max_iter, tol=-1, random_state=0, ) return estimator def make_scorers(self): self.train_scorer = lambda _, __: neg_mean_inertia( self.X, self.estimator.predict(self.X), self.estimator.cluster_centers_ ) self.test_scorer = lambda _, __: neg_mean_inertia( self.X_val, self.estimator.predict(self.X_val), self.estimator.cluster_centers_, ) class MiniBatchKMeansBenchmark(Predictor, Transformer, Estimator, Benchmark): """ Benchmarks for MiniBatchKMeans. """ param_names = ["representation", "init"] params = (["dense", "sparse"], ["random", "k-means++"]) def setup_cache(self): super().setup_cache() def make_data(self, params): representation, init = params if representation == "sparse": data = _20newsgroups_highdim_dataset() else: data = _blobs_dataset(n_clusters=20) return data def make_estimator(self, params): representation, init = params max_iter = 5 if representation == "sparse" else 2 estimator = MiniBatchKMeans( n_clusters=20, init=init, n_init=1, max_iter=max_iter, batch_size=1000, max_no_improvement=None, compute_labels=False, random_state=0, ) return estimator def make_scorers(self): self.train_scorer = lambda _, __: neg_mean_inertia( self.X, self.estimator.predict(self.X), self.estimator.cluster_centers_ ) self.test_scorer = lambda _, __: neg_mean_inertia( self.X_val, self.estimator.predict(self.X_val), self.estimator.cluster_centers_, ) ================================================ FILE: asv_benchmarks/benchmarks/common.py ================================================ import os import json import timeit import pickle import itertools from abc import ABC, abstractmethod from pathlib import Path from multiprocessing import cpu_count import numpy as np def get_from_config(): """Get benchmarks configuration from the config.json file""" current_path = Path(__file__).resolve().parent config_path = current_path / "config.json" with open(config_path, "r") as config_file: config_file = "".join(line for line in config_file if line and "//" not in line) config = json.loads(config_file) profile = os.getenv("SKLBENCH_PROFILE", config["profile"]) n_jobs_vals_env = os.getenv("SKLBENCH_NJOBS") if n_jobs_vals_env: n_jobs_vals = eval(n_jobs_vals_env) else: n_jobs_vals = config["n_jobs_vals"] if not n_jobs_vals: n_jobs_vals = list(range(1, 1 + cpu_count())) cache_path = current_path / "cache" cache_path.mkdir(exist_ok=True) (cache_path / "estimators").mkdir(exist_ok=True) (cache_path / "tmp").mkdir(exist_ok=True) save_estimators = os.getenv("SKLBENCH_SAVE_ESTIMATORS", config["save_estimators"]) save_dir = os.getenv("ASV_COMMIT", "new")[:8] if save_estimators: (cache_path / "estimators" / save_dir).mkdir(exist_ok=True) base_commit = os.getenv("SKLBENCH_BASE_COMMIT", config["base_commit"]) bench_predict = os.getenv("SKLBENCH_PREDICT", config["bench_predict"]) bench_transform = os.getenv("SKLBENCH_TRANSFORM", config["bench_transform"]) return ( profile, n_jobs_vals, save_estimators, save_dir, base_commit, 
bench_predict, bench_transform, ) def get_estimator_path(benchmark, directory, params, save=False): """Get path of pickled fitted estimator""" path = Path(__file__).resolve().parent / "cache" path = (path / "estimators" / directory) if save else (path / "tmp") filename = ( benchmark.__class__.__name__ + "_estimator_" + "_".join(list(map(str, params))) + ".pkl" ) return path / filename def clear_tmp(): """Clean the tmp directory""" path = Path(__file__).resolve().parent / "cache" / "tmp" for child in path.iterdir(): child.unlink() class Benchmark(ABC): """Abstract base class for all the benchmarks""" timer = timeit.default_timer # wall time processes = 1 timeout = 500 ( profile, n_jobs_vals, save_estimators, save_dir, base_commit, bench_predict, bench_transform, ) = get_from_config() if profile == "fast": warmup_time = 0 repeat = 1 number = 1 min_run_count = 1 data_size = "small" elif profile == "regular": warmup_time = 1 repeat = (3, 100, 30) data_size = "small" elif profile == "large_scale": warmup_time = 1 repeat = 3 number = 1 data_size = "large" @property @abstractmethod def params(self): pass class Estimator(ABC): """Abstract base class for all benchmarks of estimators""" @abstractmethod def make_data(self, params): """Return the dataset for a combination of parameters""" # The datasets are cached using joblib.Memory so it's fast and can be # called for each repeat pass @abstractmethod def make_estimator(self, params): """Return an instance of the estimator for a combination of parameters""" pass def skip(self, params): """Return True if the benchmark should be skipped for these params""" return False def setup_cache(self): """Pickle a fitted estimator for all combinations of parameters""" # This is run once per benchmark class. clear_tmp() param_grid = list(itertools.product(*self.params)) for params in param_grid: if self.skip(params): continue estimator = self.make_estimator(params) X, _, y, _ = self.make_data(params) estimator.fit(X, y) est_path = get_estimator_path( self, Benchmark.save_dir, params, Benchmark.save_estimators ) with est_path.open(mode="wb") as f: pickle.dump(estimator, f) def setup(self, *params): """Generate dataset and load the fitted estimator""" # This is run once per combination of parameters and per repeat so we # need to avoid doing expensive operations there. 
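        # Raising NotImplementedError from setup() is the asv convention for
        # skipping a benchmark: the parameter combination is reported as
        # skipped rather than failed.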
        if self.skip(params):
            raise NotImplementedError

        self.X, self.X_val, self.y, self.y_val = self.make_data(params)

        est_path = get_estimator_path(
            self, Benchmark.save_dir, params, Benchmark.save_estimators
        )
        with est_path.open(mode="rb") as f:
            self.estimator = pickle.load(f)

        self.make_scorers()

    def time_fit(self, *args):
        self.estimator.fit(self.X, self.y)

    def peakmem_fit(self, *args):
        self.estimator.fit(self.X, self.y)

    def track_train_score(self, *args):
        if hasattr(self.estimator, "predict"):
            y_pred = self.estimator.predict(self.X)
        else:
            y_pred = None
        return float(self.train_scorer(self.y, y_pred))

    def track_test_score(self, *args):
        if hasattr(self.estimator, "predict"):
            y_val_pred = self.estimator.predict(self.X_val)
        else:
            y_val_pred = None
        return float(self.test_scorer(self.y_val, y_val_pred))


class Predictor(ABC):
    """Abstract base class for benchmarks of estimators implementing predict"""

    if Benchmark.bench_predict:

        def time_predict(self, *args):
            self.estimator.predict(self.X)

        def peakmem_predict(self, *args):
            self.estimator.predict(self.X)

        if Benchmark.base_commit is not None:

            def track_same_prediction(self, *args):
                est_path = get_estimator_path(self, Benchmark.base_commit, args, True)
                with est_path.open(mode="rb") as f:
                    estimator_base = pickle.load(f)

                y_val_pred_base = estimator_base.predict(self.X_val)
                y_val_pred = self.estimator.predict(self.X_val)

                return np.allclose(y_val_pred_base, y_val_pred)

    @property
    @abstractmethod
    def params(self):
        pass


class Transformer(ABC):
    """Abstract base class for benchmarks of estimators implementing transform"""

    if Benchmark.bench_transform:

        def time_transform(self, *args):
            self.estimator.transform(self.X)

        def peakmem_transform(self, *args):
            self.estimator.transform(self.X)

        if Benchmark.base_commit is not None:

            def track_same_transform(self, *args):
                est_path = get_estimator_path(self, Benchmark.base_commit, args, True)
                with est_path.open(mode="rb") as f:
                    estimator_base = pickle.load(f)

                X_val_t_base = estimator_base.transform(self.X_val)
                X_val_t = self.estimator.transform(self.X_val)

                return np.allclose(X_val_t_base, X_val_t)

    @property
    @abstractmethod
    def params(self):
        pass

================================================
FILE: asv_benchmarks/benchmarks/config.json
================================================
{
    // "regular": Benchmarks are run on small to medium datasets. Each
    //            benchmark is run multiple times and averaged.
    // "fast": Benchmarks are run on small to medium datasets. Each benchmark
    //         is run only once. May provide unstable benchmarks.
    // "large_scale": Benchmarks are run on large datasets. Each benchmark is
    //                run multiple times and averaged. This profile is meant
    //                to benchmark scalability and will take hours on a single
    //                core.
    // Can be overridden by environment variable SKLBENCH_PROFILE.
    "profile": "regular",

    // List of values of n_jobs to use for estimators which accept this
    // parameter (-1 means all cores). An empty list means all values from 1 to
    // the maximum number of available cores.
    // Can be overridden by environment variable SKLBENCH_NJOBS.
    "n_jobs_vals": [1],

    // If true, fitted estimators are saved in ./cache/estimators/
    // Can be overridden by environment variable SKLBENCH_SAVE_ESTIMATORS.
    "save_estimators": false,

    // Commit hash to compare estimator predictions with.
    // If null, predictions are not compared.
    // Can be overridden by environment variable SKLBENCH_BASE_COMMIT.
    "base_commit": null,

    // If false, the predict (resp. transform) method of the estimators won't
    // be benchmarked.
// Can be overridden by environment variables SKLBENCH_PREDICT and // SKLBENCH_TRANSFORM. "bench_predict": true, "bench_transform": true } ================================================ FILE: asv_benchmarks/benchmarks/datasets.py ================================================ import numpy as np import scipy.sparse as sp from joblib import Memory from pathlib import Path from sklearn.decomposition import TruncatedSVD from sklearn.datasets import ( make_blobs, fetch_20newsgroups, fetch_openml, load_digits, make_regression, make_classification, fetch_olivetti_faces, ) from sklearn.preprocessing import MaxAbsScaler, StandardScaler from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split # memory location for caching datasets M = Memory(location=str(Path(__file__).resolve().parent / "cache")) @M.cache def _blobs_dataset(n_samples=500000, n_features=3, n_clusters=100, dtype=np.float32): X, _ = make_blobs( n_samples=n_samples, n_features=n_features, centers=n_clusters, random_state=0 ) X = X.astype(dtype, copy=False) X, X_val = train_test_split(X, test_size=0.1, random_state=0) return X, X_val, None, None @M.cache def _20newsgroups_highdim_dataset(n_samples=None, ngrams=(1, 1), dtype=np.float32): newsgroups = fetch_20newsgroups(random_state=0) vectorizer = TfidfVectorizer(ngram_range=ngrams, dtype=dtype) X = vectorizer.fit_transform(newsgroups.data[:n_samples]) y = newsgroups.target[:n_samples] X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) return X, X_val, y, y_val @M.cache def _20newsgroups_lowdim_dataset(n_components=100, ngrams=(1, 1), dtype=np.float32): newsgroups = fetch_20newsgroups() vectorizer = TfidfVectorizer(ngram_range=ngrams) X = vectorizer.fit_transform(newsgroups.data) X = X.astype(dtype, copy=False) svd = TruncatedSVD(n_components=n_components) X = svd.fit_transform(X) y = newsgroups.target X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) return X, X_val, y, y_val @M.cache def _mnist_dataset(dtype=np.float32): X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False) X = X.astype(dtype, copy=False) X = MaxAbsScaler().fit_transform(X) X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) return X, X_val, y, y_val @M.cache def _digits_dataset(n_samples=None, dtype=np.float32): X, y = load_digits(return_X_y=True) X = X.astype(dtype, copy=False) X = MaxAbsScaler().fit_transform(X) X = X[:n_samples] y = y[:n_samples] X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) return X, X_val, y, y_val @M.cache def _synth_regression_dataset(n_samples=100000, n_features=100, dtype=np.float32): X, y = make_regression( n_samples=n_samples, n_features=n_features, n_informative=n_features // 10, noise=50, random_state=0, ) X = X.astype(dtype, copy=False) X = StandardScaler().fit_transform(X) X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) return X, X_val, y, y_val @M.cache def _synth_regression_sparse_dataset( n_samples=10000, n_features=10000, density=0.01, dtype=np.float32 ): X = sp.random( m=n_samples, n=n_features, density=density, format="csr", random_state=0 ) X.data = np.random.RandomState(0).randn(X.getnnz()) X = X.astype(dtype, copy=False) coefs = sp.random(m=n_features, n=1, density=0.5, random_state=0) coefs.data = np.random.RandomState(0).randn(coefs.getnnz()) y = X.dot(coefs.toarray()).reshape(-1) y += 0.2 * y.std() * np.random.randn(n_samples) X, X_val, y, y_val = train_test_split(X, 
y, test_size=0.1, random_state=0) return X, X_val, y, y_val @M.cache def _synth_classification_dataset( n_samples=1000, n_features=10000, n_classes=2, dtype=np.float32 ): X, y = make_classification( n_samples=n_samples, n_features=n_features, n_classes=n_classes, random_state=0, n_informative=n_features, n_redundant=0, ) X = X.astype(dtype, copy=False) X = StandardScaler().fit_transform(X) X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) return X, X_val, y, y_val @M.cache def _olivetti_faces_dataset(): dataset = fetch_olivetti_faces(shuffle=True, random_state=42) faces = dataset.data n_samples, n_features = faces.shape faces_centered = faces - faces.mean(axis=0) # local centering faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1) X = faces_centered X, X_val = train_test_split(X, test_size=0.1, random_state=0) return X, X_val, None, None @M.cache def _random_dataset( n_samples=1000, n_features=1000, representation="dense", dtype=np.float32 ): if representation == "dense": X = np.random.RandomState(0).random_sample((n_samples, n_features)) X = X.astype(dtype, copy=False) else: X = sp.random( n_samples, n_features, density=0.05, format="csr", dtype=dtype, random_state=0, ) X, X_val = train_test_split(X, test_size=0.1, random_state=0) return X, X_val, None, None ================================================ FILE: asv_benchmarks/benchmarks/decomposition.py ================================================ from sklearn.decomposition import PCA, DictionaryLearning, MiniBatchDictionaryLearning from .common import Benchmark, Estimator, Transformer from .datasets import _olivetti_faces_dataset, _mnist_dataset from .utils import make_pca_scorers, make_dict_learning_scorers class PCABenchmark(Transformer, Estimator, Benchmark): """ Benchmarks for PCA. """ param_names = ["svd_solver"] params = (["full", "arpack", "randomized"],) def setup_cache(self): super().setup_cache() def make_data(self, params): return _mnist_dataset() def make_estimator(self, params): (svd_solver,) = params estimator = PCA(n_components=32, svd_solver=svd_solver, random_state=0) return estimator def make_scorers(self): make_pca_scorers(self) class DictionaryLearningBenchmark(Transformer, Estimator, Benchmark): """ Benchmarks for DictionaryLearning. 
""" param_names = ["fit_algorithm", "n_jobs"] params = (["lars", "cd"], Benchmark.n_jobs_vals) def setup_cache(self): super().setup_cache() def make_data(self, params): return _olivetti_faces_dataset() def make_estimator(self, params): fit_algorithm, n_jobs = params estimator = DictionaryLearning( n_components=15, fit_algorithm=fit_algorithm, alpha=0.1, max_iter=20, tol=1e-16, random_state=0, n_jobs=n_jobs, ) return estimator def make_scorers(self): make_dict_learning_scorers(self) class MiniBatchDictionaryLearningBenchmark(Transformer, Estimator, Benchmark): """ Benchmarks for MiniBatchDictionaryLearning """ param_names = ["fit_algorithm", "n_jobs"] params = (["lars", "cd"], Benchmark.n_jobs_vals) def setup_cache(self): super().setup_cache() def make_data(self, params): return _olivetti_faces_dataset() def make_estimator(self, params): fit_algorithm, n_jobs = params estimator = MiniBatchDictionaryLearning( n_components=15, fit_algorithm=fit_algorithm, alpha=0.1, batch_size=3, random_state=0, n_jobs=n_jobs, ) return estimator def make_scorers(self): make_dict_learning_scorers(self) ================================================ FILE: asv_benchmarks/benchmarks/ensemble.py ================================================ from sklearn.ensemble import ( RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ) from .common import Benchmark, Estimator, Predictor from .datasets import ( _20newsgroups_highdim_dataset, _20newsgroups_lowdim_dataset, _synth_classification_dataset, ) from .utils import make_gen_classif_scorers class RandomForestClassifierBenchmark(Predictor, Estimator, Benchmark): """ Benchmarks for RandomForestClassifier. """ param_names = ["representation", "n_jobs"] params = (["dense", "sparse"], Benchmark.n_jobs_vals) def setup_cache(self): super().setup_cache() def make_data(self, params): representation, n_jobs = params if representation == "sparse": data = _20newsgroups_highdim_dataset() else: data = _20newsgroups_lowdim_dataset() return data def make_estimator(self, params): representation, n_jobs = params n_estimators = 500 if Benchmark.data_size == "large" else 100 estimator = RandomForestClassifier( n_estimators=n_estimators, min_samples_split=10, max_features="log2", n_jobs=n_jobs, random_state=0, ) return estimator def make_scorers(self): make_gen_classif_scorers(self) class GradientBoostingClassifierBenchmark(Predictor, Estimator, Benchmark): """ Benchmarks for GradientBoostingClassifier. """ param_names = ["representation"] params = (["dense", "sparse"],) def setup_cache(self): super().setup_cache() def make_data(self, params): (representation,) = params if representation == "sparse": data = _20newsgroups_highdim_dataset() else: data = _20newsgroups_lowdim_dataset() return data def make_estimator(self, params): (representation,) = params n_estimators = 100 if Benchmark.data_size == "large" else 10 estimator = GradientBoostingClassifier( n_estimators=n_estimators, max_features="log2", subsample=0.5, random_state=0, ) return estimator def make_scorers(self): make_gen_classif_scorers(self) class HistGradientBoostingClassifierBenchmark(Predictor, Estimator, Benchmark): """ Benchmarks for HistGradientBoostingClassifier. 
""" param_names = [] params = () def setup_cache(self): super().setup_cache() def make_data(self, params): data = _synth_classification_dataset( n_samples=10000, n_features=100, n_classes=5 ) return data def make_estimator(self, params): estimator = HistGradientBoostingClassifier( max_iter=100, max_leaf_nodes=15, early_stopping=False, random_state=0 ) return estimator def make_scorers(self): make_gen_classif_scorers(self) ================================================ FILE: asv_benchmarks/benchmarks/linear_model.py ================================================ from sklearn.linear_model import ( LogisticRegression, Ridge, ElasticNet, Lasso, LinearRegression, SGDRegressor, ) from .common import Benchmark, Estimator, Predictor from .datasets import ( _20newsgroups_highdim_dataset, _20newsgroups_lowdim_dataset, _synth_regression_dataset, _synth_regression_sparse_dataset, ) from .utils import make_gen_classif_scorers, make_gen_reg_scorers class LogisticRegressionBenchmark(Predictor, Estimator, Benchmark): """ Benchmarks for LogisticRegression. """ param_names = ["representation", "solver", "n_jobs"] params = (["dense", "sparse"], ["lbfgs", "saga"], Benchmark.n_jobs_vals) def setup_cache(self): super().setup_cache() def make_data(self, params): representation, solver, n_jobs = params if Benchmark.data_size == "large": if representation == "sparse": data = _20newsgroups_highdim_dataset(n_samples=10000) else: data = _20newsgroups_lowdim_dataset(n_components=1e3) else: if representation == "sparse": data = _20newsgroups_highdim_dataset(n_samples=2500) else: data = _20newsgroups_lowdim_dataset() return data def make_estimator(self, params): representation, solver, n_jobs = params penalty = "l2" if solver == "lbfgs" else "l1" estimator = LogisticRegression( solver=solver, penalty=penalty, multi_class="multinomial", tol=0.01, n_jobs=n_jobs, random_state=0, ) return estimator def make_scorers(self): make_gen_classif_scorers(self) class RidgeBenchmark(Predictor, Estimator, Benchmark): """ Benchmarks for Ridge. """ param_names = ["representation", "solver"] params = ( ["dense", "sparse"], ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"], ) def setup_cache(self): super().setup_cache() def make_data(self, params): representation, solver = params if representation == "dense": data = _synth_regression_dataset(n_samples=500000, n_features=100) else: data = _synth_regression_sparse_dataset( n_samples=100000, n_features=10000, density=0.005 ) return data def make_estimator(self, params): representation, solver = params estimator = Ridge(solver=solver, fit_intercept=False, random_state=0) return estimator def make_scorers(self): make_gen_reg_scorers(self) def skip(self, params): representation, solver = params if representation == "sparse" and solver == "svd": return True return False class LinearRegressionBenchmark(Predictor, Estimator, Benchmark): """ Benchmarks for Linear Reagression. 
""" param_names = ["representation"] params = (["dense", "sparse"],) def setup_cache(self): super().setup_cache() def make_data(self, params): (representation,) = params if representation == "dense": data = _synth_regression_dataset(n_samples=1000000, n_features=100) else: data = _synth_regression_sparse_dataset( n_samples=10000, n_features=100000, density=0.01 ) return data def make_estimator(self, params): estimator = LinearRegression() return estimator def make_scorers(self): make_gen_reg_scorers(self) class SGDRegressorBenchmark(Predictor, Estimator, Benchmark): """ Benchmark for SGD """ param_names = ["representation"] params = (["dense", "sparse"],) def setup_cache(self): super().setup_cache() def make_data(self, params): (representation,) = params if representation == "dense": data = _synth_regression_dataset(n_samples=100000, n_features=200) else: data = _synth_regression_sparse_dataset( n_samples=100000, n_features=1000, density=0.01 ) return data def make_estimator(self, params): estimator = SGDRegressor(max_iter=1000, tol=1e-16, random_state=0) return estimator def make_scorers(self): make_gen_reg_scorers(self) class ElasticNetBenchmark(Predictor, Estimator, Benchmark): """ Benchmarks for ElasticNet. """ param_names = ["representation", "precompute"] params = (["dense", "sparse"], [True, False]) def setup_cache(self): super().setup_cache() def make_data(self, params): representation, precompute = params if representation == "dense": data = _synth_regression_dataset(n_samples=1000000, n_features=100) else: data = _synth_regression_sparse_dataset( n_samples=50000, n_features=5000, density=0.01 ) return data def make_estimator(self, params): representation, precompute = params estimator = ElasticNet(precompute=precompute, alpha=0.001, random_state=0) return estimator def make_scorers(self): make_gen_reg_scorers(self) def skip(self, params): representation, precompute = params if representation == "sparse" and precompute is False: return True return False class LassoBenchmark(Predictor, Estimator, Benchmark): """ Benchmarks for Lasso. """ param_names = ["representation", "precompute"] params = (["dense", "sparse"], [True, False]) def setup_cache(self): super().setup_cache() def make_data(self, params): representation, precompute = params if representation == "dense": data = _synth_regression_dataset(n_samples=1000000, n_features=100) else: data = _synth_regression_sparse_dataset( n_samples=50000, n_features=5000, density=0.01 ) return data def make_estimator(self, params): representation, precompute = params estimator = Lasso(precompute=precompute, alpha=0.001, random_state=0) return estimator def make_scorers(self): make_gen_reg_scorers(self) def skip(self, params): representation, precompute = params if representation == "sparse" and precompute is False: return True return False ================================================ FILE: asv_benchmarks/benchmarks/manifold.py ================================================ from sklearn.manifold import TSNE from .common import Benchmark, Estimator from .datasets import _digits_dataset class TSNEBenchmark(Estimator, Benchmark): """ Benchmarks for t-SNE. 
""" param_names = ["method"] params = (["exact", "barnes_hut"],) def setup_cache(self): super().setup_cache() def make_data(self, params): (method,) = params n_samples = 500 if method == "exact" else None return _digits_dataset(n_samples=n_samples) def make_estimator(self, params): (method,) = params estimator = TSNE(random_state=0, method=method) return estimator def make_scorers(self): self.train_scorer = lambda _, __: self.estimator.kl_divergence_ self.test_scorer = lambda _, __: self.estimator.kl_divergence_ ================================================ FILE: asv_benchmarks/benchmarks/metrics.py ================================================ from sklearn.metrics.pairwise import pairwise_distances from .common import Benchmark from .datasets import _random_dataset class PairwiseDistancesBenchmark(Benchmark): """ Benchmarks for pairwise distances. """ param_names = ["representation", "metric", "n_jobs"] params = ( ["dense", "sparse"], ["cosine", "euclidean", "manhattan", "correlation"], Benchmark.n_jobs_vals, ) def setup(self, *params): representation, metric, n_jobs = params if representation == "sparse" and metric == "correlation": raise NotImplementedError if Benchmark.data_size == "large": if metric in ("manhattan", "correlation"): n_samples = 8000 else: n_samples = 24000 else: if metric in ("manhattan", "correlation"): n_samples = 4000 else: n_samples = 12000 data = _random_dataset(n_samples=n_samples, representation=representation) self.X, self.X_val, self.y, self.y_val = data self.pdist_params = {"metric": metric, "n_jobs": n_jobs} def time_pairwise_distances(self, *args): pairwise_distances(self.X, **self.pdist_params) def peakmem_pairwise_distances(self, *args): pairwise_distances(self.X, **self.pdist_params) ================================================ FILE: asv_benchmarks/benchmarks/model_selection.py ================================================ from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import GridSearchCV, cross_val_score from .common import Benchmark, Estimator, Predictor from .datasets import _synth_classification_dataset from .utils import make_gen_classif_scorers class CrossValidationBenchmark(Benchmark): """ Benchmarks for Cross Validation. """ timeout = 20000 param_names = ["n_jobs"] params = (Benchmark.n_jobs_vals,) def setup(self, *params): (n_jobs,) = params data = _synth_classification_dataset(n_samples=50000, n_features=100) self.X, self.X_val, self.y, self.y_val = data self.clf = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=0) cv = 16 if Benchmark.data_size == "large" else 4 self.cv_params = {"n_jobs": n_jobs, "cv": cv} def time_crossval(self, *args): cross_val_score(self.clf, self.X, self.y, **self.cv_params) def peakmem_crossval(self, *args): cross_val_score(self.clf, self.X, self.y, **self.cv_params) def track_crossval(self, *args): return float(cross_val_score(self.clf, self.X, self.y, **self.cv_params).mean()) class GridSearchBenchmark(Predictor, Estimator, Benchmark): """ Benchmarks for GridSearch. 
""" timeout = 20000 param_names = ["n_jobs"] params = (Benchmark.n_jobs_vals,) def setup_cache(self): super().setup_cache() def make_data(self, params): data = _synth_classification_dataset(n_samples=10000, n_features=100) return data def make_estimator(self, params): (n_jobs,) = params clf = RandomForestClassifier(random_state=0) if Benchmark.data_size == "large": n_estimators_list = [10, 25, 50, 100, 500] max_depth_list = [5, 10, None] max_features_list = [0.1, 0.4, 0.8, 1.0] else: n_estimators_list = [10, 25, 50] max_depth_list = [5, 10] max_features_list = [0.1, 0.4, 0.8] param_grid = { "n_estimators": n_estimators_list, "max_depth": max_depth_list, "max_features": max_features_list, } estimator = GridSearchCV(clf, param_grid, n_jobs=n_jobs, cv=4) return estimator def make_scorers(self): make_gen_classif_scorers(self) ================================================ FILE: asv_benchmarks/benchmarks/neighbors.py ================================================ from sklearn.neighbors import KNeighborsClassifier from .common import Benchmark, Estimator, Predictor from .datasets import _20newsgroups_lowdim_dataset from .utils import make_gen_classif_scorers class KNeighborsClassifierBenchmark(Predictor, Estimator, Benchmark): """ Benchmarks for KNeighborsClassifier. """ param_names = ["algorithm", "dimension", "n_jobs"] params = (["brute", "kd_tree", "ball_tree"], ["low", "high"], Benchmark.n_jobs_vals) def setup_cache(self): super().setup_cache() def make_data(self, params): algorithm, dimension, n_jobs = params if Benchmark.data_size == "large": n_components = 40 if dimension == "low" else 200 else: n_components = 10 if dimension == "low" else 50 data = _20newsgroups_lowdim_dataset(n_components=n_components) return data def make_estimator(self, params): algorithm, dimension, n_jobs = params estimator = KNeighborsClassifier(algorithm=algorithm, n_jobs=n_jobs) return estimator def make_scorers(self): make_gen_classif_scorers(self) ================================================ FILE: asv_benchmarks/benchmarks/svm.py ================================================ from sklearn.svm import SVC from .common import Benchmark, Estimator, Predictor from .datasets import _synth_classification_dataset from .utils import make_gen_classif_scorers class SVCBenchmark(Predictor, Estimator, Benchmark): """Benchmarks for SVC.""" param_names = ["kernel"] params = (["linear", "poly", "rbf", "sigmoid"],) def setup_cache(self): super().setup_cache() def make_data(self, params): return _synth_classification_dataset() def make_estimator(self, params): (kernel,) = params estimator = SVC( max_iter=100, tol=1e-16, kernel=kernel, random_state=0, gamma="scale" ) return estimator def make_scorers(self): make_gen_classif_scorers(self) ================================================ FILE: asv_benchmarks/benchmarks/utils.py ================================================ import numpy as np from sklearn.metrics import balanced_accuracy_score, r2_score def neg_mean_inertia(X, labels, centers): return -(np.asarray(X - centers[labels]) ** 2).sum(axis=1).mean() def make_gen_classif_scorers(caller): caller.train_scorer = balanced_accuracy_score caller.test_scorer = balanced_accuracy_score def make_gen_reg_scorers(caller): caller.test_scorer = r2_score caller.train_scorer = r2_score def neg_mean_data_error(X, U, V): return -np.sqrt(((X - U.dot(V)) ** 2).mean()) def make_dict_learning_scorers(caller): caller.train_scorer = lambda _, __: ( neg_mean_data_error( caller.X, caller.estimator.transform(caller.X), 

================================================
FILE: asv_benchmarks/benchmarks/utils.py
================================================
import numpy as np

from sklearn.metrics import balanced_accuracy_score, r2_score


def neg_mean_inertia(X, labels, centers):
    return -(np.asarray(X - centers[labels]) ** 2).sum(axis=1).mean()


def make_gen_classif_scorers(caller):
    caller.train_scorer = balanced_accuracy_score
    caller.test_scorer = balanced_accuracy_score


def make_gen_reg_scorers(caller):
    caller.test_scorer = r2_score
    caller.train_scorer = r2_score


def neg_mean_data_error(X, U, V):
    return -np.sqrt(((X - U.dot(V)) ** 2).mean())


def make_dict_learning_scorers(caller):
    caller.train_scorer = lambda _, __: (
        neg_mean_data_error(
            caller.X,
            caller.estimator.transform(caller.X),
            caller.estimator.components_
        )
    )
    caller.test_scorer = lambda _, __: (
        neg_mean_data_error(
            caller.X_val,
            caller.estimator.transform(caller.X_val),
            caller.estimator.components_,
        )
    )


def explained_variance_ratio(Xt, X):
    return np.var(Xt, axis=0).sum() / np.var(X, axis=0).sum()


def make_pca_scorers(caller):
    caller.train_scorer = lambda _, __: caller.estimator.explained_variance_ratio_.sum()
    caller.test_scorer = lambda _, __: (
        explained_variance_ratio(caller.estimator.transform(caller.X_val), caller.X_val)
    )


================================================
FILE: azure-pipelines.yml
================================================
# Adapted from https://github.com/pandas-dev/pandas/blob/master/azure-pipelines.yml
schedules:
- cron: "30 2 * * *"
  displayName: Run nightly build
  branches:
    include:
    - main
  always: true

jobs:
- job: git_commit
  displayName: Get Git Commit
  pool:
    vmImage: ubuntu-20.04
  steps:
    - bash: |
        set -ex
        if [[ $BUILD_REASON == "PullRequest" ]]; then
          # By default pull requests use refs/pull/PULL_ID/merge as the source branch
          # which has a "Merge ID into ID" as a commit message. The latest commit
          # message is the second to last commit
          COMMIT_ID=$(echo $BUILD_SOURCEVERSIONMESSAGE | awk '{print $2}')
          message=$(git log $COMMIT_ID -1 --pretty=%B)
        else
          message=$BUILD_SOURCEVERSIONMESSAGE
        fi
        echo "##vso[task.setvariable variable=message;isOutput=true]$message"
      name: commit
      displayName: Get source version message

- job: linting
  dependsOn: [git_commit]
  condition: |
    and(
      succeeded(),
      not(contains(dependencies['git_commit']['outputs']['commit.message'], '[lint skip]')),
      not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]'))
    )
  displayName: Linting
  pool:
    vmImage: ubuntu-20.04
  steps:
    - task: UsePythonVersion@0
      inputs:
        versionSpec: '3.9'
    - bash: |
        # Include pytest compatibility with mypy
        pip install pytest flake8 mypy==0.782 black==21.6b0
      displayName: Install linters
    - bash: |
        black --check --diff .
      displayName: Run black
    - bash: |
        ./build_tools/circle/linting.sh
      displayName: Run linting
    - bash: |
        mypy sklearn/
      displayName: Run mypy

- template: build_tools/azure/posix.yml
  parameters:
    name: Linux_Nightly
    vmImage: ubuntu-20.04
    dependsOn: [git_commit, linting]
    condition: |
      and(
        succeeded(),
        not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')),
        or(eq(variables['Build.Reason'], 'Schedule'),
           contains(dependencies['git_commit']['outputs']['commit.message'], '[scipy-dev]')
        )
      )
    matrix:
      pylatest_pip_scipy_dev:
        DISTRIB: 'conda-pip-scipy-dev'
        PYTHON_VERSION: '*'
        CHECK_WARNINGS: 'true'
        CHECK_PYTEST_SOFT_DEPENDENCY: 'true'
        TEST_DOCSTRINGS: 'true'
        # Tests that require large downloads over the networks are skipped in CI.
        # Here we make sure that they are still run on a regular basis.
        SKLEARN_SKIP_NETWORK_TESTS: '0'
        CREATE_ISSUE_ON_TRACKER: 'true'

# Check compilation with intel C++ compiler (ICC)
- template: build_tools/azure/posix.yml
  parameters:
    name: Linux_Nightly_ICC
    vmImage: ubuntu-20.04
    dependsOn: [git_commit, linting]
    condition: |
      and(
        succeeded(),
        not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')),
        or(eq(variables['Build.Reason'], 'Schedule'),
           contains(dependencies['git_commit']['outputs']['commit.message'], '[icc-build]')
        )
      )
    matrix:
      pylatest_conda_forge_mkl:
        DISTRIB: 'conda'
        CONDA_CHANNEL: 'conda-forge'
        PYTHON_VERSION: '*'
        BLAS: 'mkl'
        COVERAGE: 'false'
        BUILD_WITH_ICC: 'true'

- template: build_tools/azure/posix-docker.yml
  parameters:
    name: Linux_Nightly_PyPy
    vmImage: ubuntu-20.04
    dependsOn: [linting, git_commit]
    condition: |
      and(
        succeeded(),
        not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')),
        or(
          eq(variables['Build.Reason'], 'Schedule'),
          contains(dependencies['git_commit']['outputs']['commit.message'], '[pypy]')
        )
      )
    matrix:
      pypy3:
        DISTRIB: 'conda-mamba-pypy3'
        DOCKER_CONTAINER: 'condaforge/mambaforge-pypy3:4.10.3-5'
        PILLOW_VERSION: 'none'
        PANDAS_VERSION: 'none'
        CREATE_ISSUE_ON_TRACKER: 'true'

# Will run all the time regardless of linting outcome.
- template: build_tools/azure/posix.yml
  parameters:
    name: Linux_Runs
    vmImage: ubuntu-20.04
    dependsOn: [git_commit]
    condition: |
      and(
        succeeded(),
        not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]'))
      )
    matrix:
      pylatest_conda_forge_mkl:
        DISTRIB: 'conda'
        CONDA_CHANNEL: 'conda-forge'
        PYTHON_VERSION: '*'
        BLAS: 'mkl'
        COVERAGE: 'true'
        SHOW_SHORT_SUMMARY: 'true'

# Check compilation with Ubuntu bionic 18.04 LTS and scipy from conda-forge
- template: build_tools/azure/posix.yml
  parameters:
    name: Ubuntu_Bionic
    vmImage: ubuntu-18.04
    dependsOn: [git_commit, linting]
    condition: |
      and(
        succeeded(),
        not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')),
        ne(variables['Build.Reason'], 'Schedule')
      )
    matrix:
      py37_conda_forge_openblas_ubuntu_1804:
        DISTRIB: 'conda'
        CONDA_CHANNEL: 'conda-forge'
        PYTHON_VERSION: '3.7'
        BLAS: 'openblas'
        COVERAGE: 'false'
        BUILD_WITH_ICC: 'false'

- template: build_tools/azure/posix.yml
  parameters:
    name: Linux
    vmImage: ubuntu-20.04
    dependsOn: [linting, git_commit]
    condition: |
      and(
        succeeded(),
        not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')),
        ne(variables['Build.Reason'], 'Schedule')
      )
    matrix:
      # Linux environment to test that scikit-learn can be built against
      # versions of numpy, scipy with ATLAS that comes with Ubuntu Focal 20.04
      # i.e. numpy 1.17.4 and scipy 1.3.3
      ubuntu_atlas:
        DISTRIB: 'ubuntu'
        JOBLIB_VERSION: 'min'
        PANDAS_VERSION: 'none'
        THREADPOOLCTL_VERSION: 'min'
        COVERAGE: 'false'
      # Linux + Python 3.7 build with OpenBLAS and without SITE_JOBLIB
      py37_conda_defaults_openblas:
        DISTRIB: 'conda'
        CONDA_CHANNEL: 'defaults'  # Anaconda main channel
        PYTHON_VERSION: '3.7'
        BLAS: 'openblas'
        NUMPY_VERSION: 'min'
        SCIPY_VERSION: 'min'
        MATPLOTLIB_VERSION: 'min'
        THREADPOOLCTL_VERSION: '2.2.0'
      # Linux environment to test the latest available dependencies and MKL.
      # It runs tests requiring lightgbm, pandas and PyAMG.
      pylatest_pip_openblas_pandas:
        DISTRIB: 'conda-pip-latest'
        PYTHON_VERSION: '3.9'
        PANDAS_VERSION: 'none'
        CHECK_PYTEST_SOFT_DEPENDENCY: 'true'
        TEST_DOCSTRINGS: 'true'
        CHECK_WARNINGS: 'true'

- template: build_tools/azure/posix-docker.yml
  parameters:
    name: Linux_Docker
    vmImage: ubuntu-20.04
    dependsOn: [linting, git_commit]
    condition: |
      and(
        succeeded(),
        not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')),
        ne(variables['Build.Reason'], 'Schedule')
      )
    matrix:
      debian_atlas_32bit:
        DISTRIB: 'debian-32'
        DOCKER_CONTAINER: 'i386/debian:10.9'
        JOBLIB_VERSION: 'min'
        # disable pytest xdist due to unknown bug with 32-bit container
        PYTEST_XDIST_VERSION: 'none'
        PYTEST_VERSION: 'min'
        THREADPOOLCTL_VERSION: '2.2.0'

- template: build_tools/azure/posix.yml
  parameters:
    name: macOS
    vmImage: macOS-10.14
    dependsOn: [linting, git_commit]
    condition: |
      and(
        succeeded(),
        not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')),
        ne(variables['Build.Reason'], 'Schedule')
      )
    matrix:
      pylatest_conda_forge_mkl:
        DISTRIB: 'conda'
        BLAS: 'mkl'
        CONDA_CHANNEL: 'conda-forge'
      pylatest_conda_mkl_no_openmp:
        DISTRIB: 'conda'
        BLAS: 'mkl'
        SKLEARN_TEST_NO_OPENMP: 'true'
        SKLEARN_SKIP_OPENMP_TEST: 'true'

- template: build_tools/azure/windows.yml
  parameters:
    name: Windows
    vmImage: windows-latest
    dependsOn: [linting, git_commit]
    condition: |
      and(
        succeeded(),
        not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')),
        ne(variables['Build.Reason'], 'Schedule')
      )
    matrix:
      py37_conda_forge_mkl:
        DISTRIB: 'conda'
        CONDA_CHANNEL: 'conda-forge'
        PYTHON_VERSION: '3.7'
        CHECK_WARNINGS: 'true'
        PYTHON_ARCH: '64'
        PYTEST_VERSION: '*'
        COVERAGE: 'true'
      py37_pip_openblas_32bit:
        PYTHON_VERSION: '3.7'
        PYTHON_ARCH: '32'


================================================
FILE: benchmarks/.gitignore
================================================
/bhtsne
*.npy
*.json
/mnist_tsne_output/


================================================
FILE: benchmarks/bench_20newsgroups.py
================================================
from time import time
import argparse
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.metrics import accuracy_score
from sklearn.utils.validation import check_array

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

ESTIMATORS = {
    "dummy": DummyClassifier(),
    "random_forest": RandomForestClassifier(max_features="sqrt", min_samples_split=10),
    "extra_trees": ExtraTreesClassifier(max_features="sqrt", min_samples_split=10),
    "logistic_regression": LogisticRegression(),
    "naive_bayes": MultinomialNB(),
    "adaboost": AdaBoostClassifier(n_estimators=10),
}


###############################################################################
# Data

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-e", "--estimators", nargs="+", required=True, choices=ESTIMATORS
    )
    args = vars(parser.parse_args())

    data_train = fetch_20newsgroups_vectorized(subset="train")
    data_test = fetch_20newsgroups_vectorized(subset="test")
    X_train = check_array(data_train.data, dtype=np.float32, accept_sparse="csc")
    X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr")
    y_train = data_train.target
    y_test = data_test.target

    print("20 newsgroups")
    print("=============")
    print(f"X_train.shape = {X_train.shape}")
    print(f"X_train.format = {X_train.format}")
    print(f"X_train.dtype = {X_train.dtype}")
    # np.prod is the canonical spelling; np.product is a deprecated alias
    print(f"X_train density = {X_train.nnz / np.prod(X_train.shape)}")
    print(f"y_train {y_train.shape}")
    print(f"X_test {X_test.shape}")
    print(f"X_test.format = {X_test.format}")
    print(f"X_test.dtype = {X_test.dtype}")
    print(f"y_test {y_test.shape}")
    print()
    print("Classifier Training")
    print("===================")
    accuracy, train_time, test_time = {}, {}, {}
    for name in sorted(args["estimators"]):
        clf = ESTIMATORS[name]
        try:
            clf.set_params(random_state=0)
        except (TypeError, ValueError):
            pass

        print("Training %s ... " % name, end="")
        t0 = time()
        clf.fit(X_train, y_train)
        train_time[name] = time() - t0
        t0 = time()
        y_pred = clf.predict(X_test)
        test_time[name] = time() - t0
        accuracy[name] = accuracy_score(y_test, y_pred)
        print("done")

    print()
    print("Classification performance:")
    print("===========================")
    print()
    print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", "Accuracy"))
    print("-" * 44)
    for name in sorted(accuracy, key=accuracy.get):
        print(
            "%s %s %s %s"
            % (
                name.ljust(16),
                ("%.4fs" % train_time[name]).center(10),
                ("%.4fs" % test_time[name]).center(10),
                ("%.4f" % accuracy[name]).center(10),
            )
        )

    print()
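
A typical invocation of the script above (illustrative only, not taken from
the repository; the vectorized 20 newsgroups data is downloaded on first
use). The required -e/--estimators flag takes one or more keys from the
ESTIMATORS dict:

    python benchmarks/bench_20newsgroups.py -e logistic_regression naive_bayes
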

================================================
FILE: benchmarks/bench_covertype.py
================================================
"""
===========================
Covertype dataset benchmark
===========================

Benchmark stochastic gradient descent (SGD), Liblinear, and Naive Bayes, CART
(decision tree), RandomForest and Extra-Trees on the forest covertype dataset
of Blackard, Jock, and Dean [1]. The dataset comprises 581,012 samples. It is
low dimensional with 54 features and a sparsity of approx. 23%. Here, we
consider the task of predicting class 1 (spruce/fir). The classification
performance of SGD is competitive with Liblinear while being two orders of
magnitude faster to train::

    [..]
    Classification performance:
    ===========================
    Classifier    train-time  test-time  error-rate
    --------------------------------------------
    liblinear     15.9744s    0.0705s    0.2305
    GaussianNB    3.0666s     0.3884s    0.4841
    SGD           1.0558s     0.1152s    0.2300
    CART          79.4296s    0.0523s    0.0469
    RandomForest  1190.1620s  0.5881s    0.0243
    ExtraTrees    640.3194s   0.6495s    0.0198

The same task has been used in a number of papers including:

* "SVM Optimization: Inverse Dependence on Training Set Size",
  S. Shalev-Shwartz, N. Srebro - In Proceedings of ICML '08.

* "Pegasos: Primal estimated sub-gradient solver for svm",
  S. Shalev-Shwartz, Y. Singer, N. Srebro - In Proceedings of ICML '07.

* "Training Linear SVMs in Linear Time",
  T. Joachims - In SIGKDD '06

[1] https://archive.ics.uci.edu/ml/datasets/Covertype
"""

# Author: Peter Prettenhofer
#         Arnaud Joly
# License: BSD 3 clause

import os
from time import time
import argparse
import numpy as np

from joblib import Memory
from sklearn.datasets import fetch_covtype, get_data_home
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import zero_one_loss
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(
    os.path.join(get_data_home(), "covertype_benchmark_data"), mmap_mode="r"
)


@memory.cache
def load_data(dtype=np.float32, order="C", random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(
        download_if_missing=True, shuffle=True, random_state=random_state
    )
    X = check_array(data["data"], dtype=dtype, order=order)
    y = (data["target"] != 1).astype(int)

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test


ESTIMATORS = {
    "GBRT": GradientBoostingClassifier(n_estimators=250),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=20),
    "RandomForest": RandomForestClassifier(n_estimators=20),
    "CART": DecisionTreeClassifier(min_samples_split=5),
    "SGD": SGDClassifier(alpha=0.001),
    "GaussianNB": GaussianNB(),
    # "l2" is a legacy loss alias that recent scikit-learn releases reject;
    # "squared_hinge" is the equivalent loss for LinearSVC.
    "liblinear": LinearSVC(
        loss="squared_hinge", penalty="l2", C=1000, dual=False, tol=1e-3
    ),
    "SAG": LogisticRegression(solver="sag", max_iter=2, C=1000),
}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--classifiers",
        nargs="+",
        choices=ESTIMATORS,
        type=str,
        default=["liblinear", "GaussianNB", "SGD", "CART"],
        help="list of classifiers to benchmark.",
    )
    parser.add_argument(
        "--n-jobs",
        nargs="?",
        default=1,
        type=int,
        help=(
            "Number of concurrently running workers for "
            "models that support parallelism."
        ),
    )
    parser.add_argument(
        "--order",
        nargs="?",
        default="C",
        type=str,
        choices=["F", "C"],
        help="Allow to choose between fortran and C ordered data",
    )
    parser.add_argument(
        "--random-seed",
        nargs="?",
        default=13,
        type=int,
        help="Common seed used by random number generator.",
    )
    args = vars(parser.parse_args())

    print(__doc__)
    X_train, X_test, y_train, y_test = load_data(
        order=args["order"], random_state=args["random_seed"]
    )
    print("")
    print("Dataset statistics:")
    print("===================")
    print("%s %d" % ("number of features:".ljust(25), X_train.shape[1]))
    print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size))
    print("%s %s" % ("data type:".ljust(25), X_train.dtype))
    print(
        "%s %d (pos=%d, neg=%d, size=%dMB)"
        % (
            "number of train samples:".ljust(25),
            X_train.shape[0],
            np.sum(y_train == 1),
            np.sum(y_train == 0),
            int(X_train.nbytes / 1e6),
        )
    )
    print(
        "%s %d (pos=%d, neg=%d, size=%dMB)"
        % (
            "number of test samples:".ljust(25),
            X_test.shape[0],
            np.sum(y_test == 1),
            np.sum(y_test == 0),
            int(X_test.nbytes / 1e6),
        )
    )

    print()
    print("Training Classifiers")
    print("====================")
    error, train_time, test_time = {}, {}, {}
    for name in sorted(args["classifiers"]):
        print("Training %s ... " % name, end="")
        estimator = ESTIMATORS[name]
        estimator_params = estimator.get_params()

        estimator.set_params(
            **{
                p: args["random_seed"]
                for p in estimator_params
                if p.endswith("random_state")
            }
        )

        if "n_jobs" in estimator_params:
            estimator.set_params(n_jobs=args["n_jobs"])

        time_start = time()
        estimator.fit(X_train, y_train)
        train_time[name] = time() - time_start

        time_start = time()
        y_pred = estimator.predict(X_test)
        test_time[name] = time() - time_start

        error[name] = zero_one_loss(y_test, y_pred)

        print("done")

    print()
    print("Classification performance:")
    print("===========================")
    print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", "error-rate"))
    print("-" * 44)
    for name in sorted(args["classifiers"], key=error.get):
        print(
            "%s %s %s %s"
            % (
                name.ljust(12),
                ("%.4fs" % train_time[name]).center(10),
                ("%.4fs" % test_time[name]).center(10),
                ("%.4f" % error[name]).center(10),
            )
        )

    print()
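
A typical invocation of the covertype benchmark above (illustrative only,
not taken from the repository; the dataset is fetched on first use). The
--classifiers names come from its ESTIMATORS dict, and --n-jobs is forwarded
to estimators that support parallelism:

    python benchmarks/bench_covertype.py --classifiers SGD CART RandomForest --n-jobs 4
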

================================================
FILE: benchmarks/bench_feature_expansions.py
================================================
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sparse
from sklearn.preprocessing import PolynomialFeatures
from time import time

degree = 2
trials = 3
num_rows = 1000
dimensionalities = np.array([1, 2, 8, 16, 32, 64])
densities = np.array([0.01, 0.1, 1.0])
csr_times = {d: np.zeros(len(dimensionalities)) for d in densities}
dense_times = {d: np.zeros(len(dimensionalities)) for d in densities}
transform = PolynomialFeatures(
    degree=degree, include_bias=False, interaction_only=False
)

for trial in range(trials):
    for density in densities:
        for dim_index, dim in enumerate(dimensionalities):
            print(trial, density, dim)
            X_csr = sparse.random(num_rows, dim, density).tocsr()
            X_dense = X_csr.toarray()
            # CSR
            t0 = time()
            transform.fit_transform(X_csr)
            csr_times[density][dim_index] += time() - t0
            # Dense
            t0 = time()
            transform.fit_transform(X_dense)
            dense_times[density][dim_index] += time() - t0

csr_linestyle = (0, (3, 1, 1, 1, 1, 1))  # densely dashdotdotted
dense_linestyle = (0, ())  # solid

fig, axes = plt.subplots(nrows=len(densities), ncols=1, figsize=(8, 10))
for density, ax in zip(densities, axes):
    ax.plot(
        dimensionalities,
        csr_times[density] / trials,
        label="csr",
        linestyle=csr_linestyle,
    )
    ax.plot(
        dimensionalities,
        dense_times[density] / trials,
        label="dense",
        linestyle=dense_linestyle,
    )
    ax.set_title("density %0.2f, degree=%d, n_samples=%d" % (density, degree, num_rows))
    ax.legend()
    ax.set_xlabel("Dimensionality")
    ax.set_ylabel("Time (seconds)")
plt.tight_layout()
plt.show()


================================================
FILE: benchmarks/bench_glm.py
================================================
"""
A comparison of different methods in GLM

Data comes from a random square matrix.

"""
from datetime import datetime

import numpy as np

from sklearn import linear_model


if __name__ == "__main__":

    import matplotlib.pyplot as plt

    n_iter = 40

    time_ridge = np.empty(n_iter)
    time_ols = np.empty(n_iter)
    time_lasso = np.empty(n_iter)

    # x-axis values matching the problem sizes actually used in the loop
    # below (10 * i + 3); the original script plotted against an unrelated
    # 500 * np.arange(1, n_iter + 1) grid, mislabeling the axis.
    dimensions = 10 * np.arange(n_iter) + 3

    for i in range(n_iter):
        print("Iteration %s of %s" % (i, n_iter))

        n_samples, n_features = 10 * i + 3, 10 * i + 3

        X = np.random.randn(n_samples, n_features)
        Y = np.random.randn(n_samples)

        start = datetime.now()
        ridge = linear_model.Ridge(alpha=1.0)
        ridge.fit(X, Y)
        time_ridge[i] = (datetime.now() - start).total_seconds()

        start = datetime.now()
        ols = linear_model.LinearRegression()
        ols.fit(X, Y)
        time_ols[i] = (datetime.now() - start).total_seconds()

        start = datetime.now()
        lasso = linear_model.LassoLars()
        lasso.fit(X, Y)
        time_lasso[i] = (datetime.now() - start).total_seconds()

    plt.figure("scikit-learn GLM benchmark results")
    plt.xlabel("Dimensions")
    plt.ylabel("Time (s)")
    plt.plot(dimensions, time_ridge, color="r")
    plt.plot(dimensions, time_ols, color="g")
    plt.plot(dimensions, time_lasso, color="b")

    plt.legend(["Ridge", "OLS", "LassoLars"], loc="upper left")
    plt.axis("tight")
    plt.show()
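
The datetime-based stopwatch in bench_glm.py above is coarse; for interval
timing, time.perf_counter() is the usual choice since it is monotonic and has
higher resolution. A minimal sketch of the same measurement (illustrative
only, not part of the repository):

    from time import perf_counter

    import numpy as np
    from sklearn.linear_model import Ridge

    X, Y = np.random.randn(503, 503), np.random.randn(503)
    start = perf_counter()
    Ridge(alpha=1.0).fit(X, Y)
    print(f"fit took {perf_counter() - start:.4f}s")
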
""" import numpy as np import gc from time import time from sklearn.datasets import make_regression alpha = 0.1 # alpha = 0.01 def rmse(a, b): return np.sqrt(np.mean((a - b) ** 2)) def bench(factory, X, Y, X_test, Y_test, ref_coef): gc.collect() # start time tstart = time() clf = factory(alpha=alpha).fit(X, Y) delta = time() - tstart # stop time print("duration: %0.3fs" % delta) print("rmse: %f" % rmse(Y_test, clf.predict(X_test))) print("mean coef abs diff: %f" % abs(ref_coef - clf.coef_.ravel()).mean()) return delta if __name__ == "__main__": from glmnet.elastic_net import Lasso as GlmnetLasso from sklearn.linear_model import Lasso as ScikitLasso # Delayed import of matplotlib.pyplot import matplotlib.pyplot as plt scikit_results = [] glmnet_results = [] n = 20 step = 500 n_features = 1000 n_informative = n_features / 10 n_test_samples = 1000 for i in range(1, n + 1): print("==================") print("Iteration %s of %s" % (i, n)) print("==================") X, Y, coef_ = make_regression( n_samples=(i * step) + n_test_samples, n_features=n_features, noise=0.1, n_informative=n_informative, coef=True, ) X_test = X[-n_test_samples:] Y_test = Y[-n_test_samples:] X = X[: (i * step)] Y = Y[: (i * step)] print("benchmarking scikit-learn: ") scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_)) print("benchmarking glmnet: ") glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) plt.clf() xx = range(0, n * step, step) plt.title("Lasso regression on sample dataset (%d features)" % n_features) plt.plot(xx, scikit_results, "b-", label="scikit-learn") plt.plot(xx, glmnet_results, "r-", label="glmnet") plt.legend() plt.xlabel("number of samples to classify") plt.ylabel("Time (s)") plt.show() # now do a benchmark where the number of points is fixed # and the variable is the number of features scikit_results = [] glmnet_results = [] n = 20 step = 100 n_samples = 500 for i in range(1, n + 1): print("==================") print("Iteration %02d of %02d" % (i, n)) print("==================") n_features = i * step n_informative = n_features / 10 X, Y, coef_ = make_regression( n_samples=(i * step) + n_test_samples, n_features=n_features, noise=0.1, n_informative=n_informative, coef=True, ) X_test = X[-n_test_samples:] Y_test = Y[-n_test_samples:] X = X[:n_samples] Y = Y[:n_samples] print("benchmarking scikit-learn: ") scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_)) print("benchmarking glmnet: ") glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) xx = np.arange(100, 100 + n * step, step) plt.figure("scikit-learn vs. 
glmnet benchmark results") plt.title("Regression in high dimensional spaces (%d samples)" % n_samples) plt.plot(xx, scikit_results, "b-", label="scikit-learn") plt.plot(xx, glmnet_results, "r-", label="glmnet") plt.legend() plt.xlabel("number of features") plt.ylabel("Time (s)") plt.axis("tight") plt.show() ================================================ FILE: benchmarks/bench_hist_gradient_boosting.py ================================================ from time import time import argparse import matplotlib.pyplot as plt import numpy as np from sklearn.model_selection import train_test_split from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.datasets import make_classification from sklearn.datasets import make_regression from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator parser = argparse.ArgumentParser() parser.add_argument("--n-leaf-nodes", type=int, default=31) parser.add_argument("--n-trees", type=int, default=10) parser.add_argument( "--lightgbm", action="store_true", default=False, help="also plot lightgbm" ) parser.add_argument( "--xgboost", action="store_true", default=False, help="also plot xgboost" ) parser.add_argument( "--catboost", action="store_true", default=False, help="also plot catboost" ) parser.add_argument("--learning-rate", type=float, default=0.1) parser.add_argument( "--problem", type=str, default="classification", choices=["classification", "regression"], ) parser.add_argument("--loss", type=str, default="default") parser.add_argument("--missing-fraction", type=float, default=0) parser.add_argument("--n-classes", type=int, default=2) parser.add_argument("--n-samples-max", type=int, default=int(1e6)) parser.add_argument("--n-features", type=int, default=20) parser.add_argument("--max-bins", type=int, default=255) parser.add_argument( "--random-sample-weights", action="store_true", default=False, help="generate and use random sample weights", ) args = parser.parse_args() n_leaf_nodes = args.n_leaf_nodes n_trees = args.n_trees lr = args.learning_rate max_bins = args.max_bins def get_estimator_and_data(): if args.problem == "classification": X, y = make_classification( args.n_samples_max * 2, n_features=args.n_features, n_classes=args.n_classes, n_clusters_per_class=1, n_informative=args.n_classes, random_state=0, ) return X, y, HistGradientBoostingClassifier elif args.problem == "regression": X, y = make_regression( args.n_samples_max * 2, n_features=args.n_features, random_state=0 ) return X, y, HistGradientBoostingRegressor X, y, Estimator = get_estimator_and_data() if args.missing_fraction: mask = np.random.binomial(1, args.missing_fraction, size=X.shape).astype(bool) X[mask] = np.nan if args.random_sample_weights: sample_weight = np.random.rand(len(X)) * 10 else: sample_weight = None if sample_weight is not None: (X_train_, X_test_, y_train_, y_test_, sample_weight_train_, _) = train_test_split( X, y, sample_weight, test_size=0.5, random_state=0 ) else: X_train_, X_test_, y_train_, y_test_ = train_test_split( X, y, test_size=0.5, random_state=0 ) sample_weight_train_ = None def one_run(n_samples): X_train = X_train_[:n_samples] X_test = X_test_[:n_samples] y_train = y_train_[:n_samples] y_test = y_test_[:n_samples] if sample_weight is not None: sample_weight_train = sample_weight_train_[:n_samples] else: sample_weight_train = None assert X_train.shape[0] == n_samples assert X_test.shape[0] == n_samples print("Data size: %d samples train, %d samples 
test." % (n_samples, n_samples)) print("Fitting a sklearn model...") tic = time() est = Estimator( learning_rate=lr, max_iter=n_trees, max_bins=max_bins, max_leaf_nodes=n_leaf_nodes, early_stopping=False, random_state=0, verbose=0, ) loss = args.loss if args.problem == "classification": if loss == "default": # loss='auto' does not work with get_equivalent_estimator() loss = ( "binary_crossentropy" if args.n_classes == 2 else "categorical_crossentropy" ) else: # regression if loss == "default": loss = "squared_error" est.set_params(loss=loss) est.fit(X_train, y_train, sample_weight=sample_weight_train) sklearn_fit_duration = time() - tic tic = time() sklearn_score = est.score(X_test, y_test) sklearn_score_duration = time() - tic print("score: {:.4f}".format(sklearn_score)) print("fit duration: {:.3f}s,".format(sklearn_fit_duration)) print("score duration: {:.3f}s,".format(sklearn_score_duration)) lightgbm_score = None lightgbm_fit_duration = None lightgbm_score_duration = None if args.lightgbm: print("Fitting a LightGBM model...") lightgbm_est = get_equivalent_estimator( est, lib="lightgbm", n_classes=args.n_classes ) tic = time() lightgbm_est.fit(X_train, y_train, sample_weight=sample_weight_train) lightgbm_fit_duration = time() - tic tic = time() lightgbm_score = lightgbm_est.score(X_test, y_test) lightgbm_score_duration = time() - tic print("score: {:.4f}".format(lightgbm_score)) print("fit duration: {:.3f}s,".format(lightgbm_fit_duration)) print("score duration: {:.3f}s,".format(lightgbm_score_duration)) xgb_score = None xgb_fit_duration = None xgb_score_duration = None if args.xgboost: print("Fitting an XGBoost model...") xgb_est = get_equivalent_estimator(est, lib="xgboost") tic = time() xgb_est.fit(X_train, y_train, sample_weight=sample_weight_train) xgb_fit_duration = time() - tic tic = time() xgb_score = xgb_est.score(X_test, y_test) xgb_score_duration = time() - tic print("score: {:.4f}".format(xgb_score)) print("fit duration: {:.3f}s,".format(xgb_fit_duration)) print("score duration: {:.3f}s,".format(xgb_score_duration)) cat_score = None cat_fit_duration = None cat_score_duration = None if args.catboost: print("Fitting a CatBoost model...") cat_est = get_equivalent_estimator(est, lib="catboost") tic = time() cat_est.fit(X_train, y_train, sample_weight=sample_weight_train) cat_fit_duration = time() - tic tic = time() cat_score = cat_est.score(X_test, y_test) cat_score_duration = time() - tic print("score: {:.4f}".format(cat_score)) print("fit duration: {:.3f}s,".format(cat_fit_duration)) print("score duration: {:.3f}s,".format(cat_score_duration)) return ( sklearn_score, sklearn_fit_duration, sklearn_score_duration, lightgbm_score, lightgbm_fit_duration, lightgbm_score_duration, xgb_score, xgb_fit_duration, xgb_score_duration, cat_score, cat_fit_duration, cat_score_duration, ) n_samples_list = [1000, 10000, 100000, 500000, 1000000, 5000000, 10000000] n_samples_list = [ n_samples for n_samples in n_samples_list if n_samples <= args.n_samples_max ] sklearn_scores = [] sklearn_fit_durations = [] sklearn_score_durations = [] lightgbm_scores = [] lightgbm_fit_durations = [] lightgbm_score_durations = [] xgb_scores = [] xgb_fit_durations = [] xgb_score_durations = [] cat_scores = [] cat_fit_durations = [] cat_score_durations = [] for n_samples in n_samples_list: ( sklearn_score, sklearn_fit_duration, sklearn_score_duration, lightgbm_score, lightgbm_fit_duration, lightgbm_score_duration, xgb_score, xgb_fit_duration, xgb_score_duration, cat_score, cat_fit_duration, cat_score_duration, ) = 
one_run(n_samples) for scores, score in ( (sklearn_scores, sklearn_score), (sklearn_fit_durations, sklearn_fit_duration), (sklearn_score_durations, sklearn_score_duration), (lightgbm_scores, lightgbm_score), (lightgbm_fit_durations, lightgbm_fit_duration), (lightgbm_score_durations, lightgbm_score_duration), (xgb_scores, xgb_score), (xgb_fit_durations, xgb_fit_duration), (xgb_score_durations, xgb_score_duration), (cat_scores, cat_score), (cat_fit_durations, cat_fit_duration), (cat_score_durations, cat_score_duration), ): scores.append(score) fig, axs = plt.subplots(3, sharex=True) axs[0].plot(n_samples_list, sklearn_scores, label="sklearn") axs[1].plot(n_samples_list, sklearn_fit_durations, label="sklearn") axs[2].plot(n_samples_list, sklearn_score_durations, label="sklearn") if args.lightgbm: axs[0].plot(n_samples_list, lightgbm_scores, label="lightgbm") axs[1].plot(n_samples_list, lightgbm_fit_durations, label="lightgbm") axs[2].plot(n_samples_list, lightgbm_score_durations, label="lightgbm") if args.xgboost: axs[0].plot(n_samples_list, xgb_scores, label="XGBoost") axs[1].plot(n_samples_list, xgb_fit_durations, label="XGBoost") axs[2].plot(n_samples_list, xgb_score_durations, label="XGBoost") if args.catboost: axs[0].plot(n_samples_list, cat_scores, label="CatBoost") axs[1].plot(n_samples_list, cat_fit_durations, label="CatBoost") axs[2].plot(n_samples_list, cat_score_durations, label="CatBoost") for ax in axs: ax.set_xscale("log") ax.legend(loc="best") ax.set_xlabel("n_samples") axs[0].set_title("scores") axs[1].set_title("fit duration (s)") axs[2].set_title("score duration (s)") title = args.problem if args.problem == "classification": title += " n_classes = {}".format(args.n_classes) fig.suptitle(title) plt.tight_layout() plt.show() ================================================ FILE: benchmarks/bench_hist_gradient_boosting_adult.py ================================================ import argparse from time import time from sklearn.model_selection import train_test_split from sklearn.datasets import fetch_openml from sklearn.metrics import accuracy_score, roc_auc_score from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator parser = argparse.ArgumentParser() parser.add_argument("--n-leaf-nodes", type=int, default=31) parser.add_argument("--n-trees", type=int, default=100) parser.add_argument("--lightgbm", action="store_true", default=False) parser.add_argument("--learning-rate", type=float, default=0.1) parser.add_argument("--max-bins", type=int, default=255) parser.add_argument("--no-predict", action="store_true", default=False) parser.add_argument("--verbose", action="store_true", default=False) args = parser.parse_args() n_leaf_nodes = args.n_leaf_nodes n_trees = args.n_trees lr = args.learning_rate max_bins = args.max_bins verbose = args.verbose def fit(est, data_train, target_train, libname, **fit_params): print(f"Fitting a {libname} model...") tic = time() est.fit(data_train, target_train, **fit_params) toc = time() print(f"fitted in {toc - tic:.3f}s") def predict(est, data_test, target_test): if args.no_predict: return tic = time() predicted_test = est.predict(data_test) predicted_proba_test = est.predict_proba(data_test) toc = time() roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) acc = accuracy_score(target_test, predicted_test) print(f"predicted in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") data = fetch_openml(data_id=179, as_frame=False) # adult dataset 
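# Editor's sketch (not part of the upstream script): this benchmark relies on
# the OpenML copy of adult already being ordinally encoded (see the note
# below). For raw string categories one would typically encode first; a
# hypothetical helper, never called here:
def _ordinal_encode(X_raw, is_categorical):
    from sklearn.compose import make_column_transformer
    from sklearn.preprocessing import OrdinalEncoder

    cat_idx = [i for i, is_cat in enumerate(is_categorical) if is_cat]
    encoder = make_column_transformer(
        (OrdinalEncoder(), cat_idx), remainder="passthrough"
    )
    return encoder.fit_transform(X_raw)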
X, y = data.data, data.target n_features = X.shape[1] n_categorical_features = len(data.categories) n_numerical_features = n_features - n_categorical_features print(f"Number of features: {n_features}") print(f"Number of categorical features: {n_categorical_features}") print(f"Number of numerical features: {n_numerical_features}") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # Note: no need to use an OrdinalEncoder because categorical features are # already clean is_categorical = [name in data.categories for name in data.feature_names] est = HistGradientBoostingClassifier( loss="binary_crossentropy", learning_rate=lr, max_iter=n_trees, max_bins=max_bins, max_leaf_nodes=n_leaf_nodes, categorical_features=is_categorical, early_stopping=False, random_state=0, verbose=verbose, ) fit(est, X_train, y_train, "sklearn") predict(est, X_test, y_test) if args.lightgbm: est = get_equivalent_estimator(est, lib="lightgbm") est.set_params(max_cat_to_onehot=1) # dont use OHE categorical_features = [ f_idx for (f_idx, is_cat) in enumerate(is_categorical) if is_cat ] fit(est, X_train, y_train, "lightgbm", categorical_feature=categorical_features) predict(est, X_test, y_test) ================================================ FILE: benchmarks/bench_hist_gradient_boosting_categorical_only.py ================================================ import argparse from time import time from sklearn.preprocessing import KBinsDiscretizer from sklearn.datasets import make_classification from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator parser = argparse.ArgumentParser() parser.add_argument("--n-leaf-nodes", type=int, default=31) parser.add_argument("--n-trees", type=int, default=100) parser.add_argument("--n-features", type=int, default=20) parser.add_argument("--n-cats", type=int, default=20) parser.add_argument("--n-samples", type=int, default=10_000) parser.add_argument("--lightgbm", action="store_true", default=False) parser.add_argument("--learning-rate", type=float, default=0.1) parser.add_argument("--max-bins", type=int, default=255) parser.add_argument("--no-predict", action="store_true", default=False) parser.add_argument("--verbose", action="store_true", default=False) args = parser.parse_args() n_leaf_nodes = args.n_leaf_nodes n_features = args.n_features n_categories = args.n_cats n_samples = args.n_samples n_trees = args.n_trees lr = args.learning_rate max_bins = args.max_bins verbose = args.verbose def fit(est, data_train, target_train, libname, **fit_params): print(f"Fitting a {libname} model...") tic = time() est.fit(data_train, target_train, **fit_params) toc = time() print(f"fitted in {toc - tic:.3f}s") def predict(est, data_test): # We don't report accuracy or ROC because the dataset doesn't really make # sense: we treat ordered features as un-ordered categories. 
if args.no_predict: return tic = time() est.predict(data_test) toc = time() print(f"predicted in {toc - tic:.3f}s") X, y = make_classification(n_samples=n_samples, n_features=n_features, random_state=0) X = KBinsDiscretizer(n_bins=n_categories, encode="ordinal").fit_transform(X) print(f"Number of features: {n_features}") print(f"Number of samples: {n_samples}") is_categorical = [True] * n_features est = HistGradientBoostingClassifier( loss="binary_crossentropy", learning_rate=lr, max_iter=n_trees, max_bins=max_bins, max_leaf_nodes=n_leaf_nodes, categorical_features=is_categorical, early_stopping=False, random_state=0, verbose=verbose, ) fit(est, X, y, "sklearn") predict(est, X) if args.lightgbm: est = get_equivalent_estimator(est, lib="lightgbm") est.set_params(max_cat_to_onehot=1) # dont use OHE categorical_features = list(range(n_features)) fit(est, X, y, "lightgbm", categorical_feature=categorical_features) predict(est, X) ================================================ FILE: benchmarks/bench_hist_gradient_boosting_higgsboson.py ================================================ from urllib.request import urlretrieve import os from gzip import GzipFile from time import time import argparse import numpy as np import pandas as pd from joblib import Memory from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, roc_auc_score from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator parser = argparse.ArgumentParser() parser.add_argument("--n-leaf-nodes", type=int, default=31) parser.add_argument("--n-trees", type=int, default=10) parser.add_argument("--lightgbm", action="store_true", default=False) parser.add_argument("--xgboost", action="store_true", default=False) parser.add_argument("--catboost", action="store_true", default=False) parser.add_argument("--learning-rate", type=float, default=1.0) parser.add_argument("--subsample", type=int, default=None) parser.add_argument("--max-bins", type=int, default=255) parser.add_argument("--no-predict", action="store_true", default=False) parser.add_argument("--cache-loc", type=str, default="/tmp") args = parser.parse_args() HERE = os.path.dirname(__file__) URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" m = Memory(location=args.cache_loc, mmap_mode="r") n_leaf_nodes = args.n_leaf_nodes n_trees = args.n_trees subsample = args.subsample lr = args.learning_rate max_bins = args.max_bins @m.cache def load_data(): filename = os.path.join(HERE, URL.rsplit("/", 1)[-1]) if not os.path.exists(filename): print(f"Downloading {URL} to {filename} (2.6 GB)...") urlretrieve(URL, filename) print("done.") print(f"Parsing {filename}...") tic = time() with GzipFile(filename) as f: df = pd.read_csv(f, header=None, dtype=np.float32) toc = time() print(f"Loaded {df.values.nbytes / 1e9:0.3f} GB in {toc - tic:0.3f}s") return df def fit(est, data_train, target_train, libname): print(f"Fitting a {libname} model...") tic = time() est.fit(data_train, target_train) toc = time() print(f"fitted in {toc - tic:.3f}s") def predict(est, data_test, target_test): if args.no_predict: return tic = time() predicted_test = est.predict(data_test) predicted_proba_test = est.predict_proba(data_test) toc = time() roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) acc = accuracy_score(target_test, predicted_test) print(f"predicted in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") df = load_data() target = 
df.values[:, 0] data = np.ascontiguousarray(df.values[:, 1:]) data_train, data_test, target_train, target_test = train_test_split( data, target, test_size=0.2, random_state=0 ) if subsample is not None: data_train, target_train = data_train[:subsample], target_train[:subsample] n_samples, n_features = data_train.shape print(f"Training set with {n_samples} records with {n_features} features.") est = HistGradientBoostingClassifier( loss="binary_crossentropy", learning_rate=lr, max_iter=n_trees, max_bins=max_bins, max_leaf_nodes=n_leaf_nodes, early_stopping=False, random_state=0, verbose=1, ) fit(est, data_train, target_train, "sklearn") predict(est, data_test, target_test) if args.lightgbm: est = get_equivalent_estimator(est, lib="lightgbm") fit(est, data_train, target_train, "lightgbm") predict(est, data_test, target_test) if args.xgboost: est = get_equivalent_estimator(est, lib="xgboost") fit(est, data_train, target_train, "xgboost") predict(est, data_test, target_test) if args.catboost: est = get_equivalent_estimator(est, lib="catboost") fit(est, data_train, target_train, "catboost") predict(est, data_test, target_test) ================================================ FILE: benchmarks/bench_hist_gradient_boosting_threading.py ================================================ from time import time import argparse import os from pprint import pprint import numpy as np from threadpoolctl import threadpool_limits import sklearn from sklearn.model_selection import train_test_split from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.datasets import make_classification from sklearn.datasets import make_regression from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator parser = argparse.ArgumentParser() parser.add_argument("--n-leaf-nodes", type=int, default=31) parser.add_argument("--n-trees", type=int, default=10) parser.add_argument( "--lightgbm", action="store_true", default=False, help="also benchmark lightgbm" ) parser.add_argument( "--xgboost", action="store_true", default=False, help="also benchmark xgboost" ) parser.add_argument( "--catboost", action="store_true", default=False, help="also benchmark catboost" ) parser.add_argument("--learning-rate", type=float, default=0.1) parser.add_argument( "--problem", type=str, default="classification", choices=["classification", "regression"], ) parser.add_argument("--loss", type=str, default="default") parser.add_argument("--missing-fraction", type=float, default=0) parser.add_argument("--n-classes", type=int, default=2) parser.add_argument("--n-samples", type=int, default=int(1e6)) parser.add_argument("--n-features", type=int, default=100) parser.add_argument("--max-bins", type=int, default=255) parser.add_argument("--print-params", action="store_true", default=False) parser.add_argument( "--random-sample-weights", action="store_true", default=False, help="generate and use random sample weights", ) parser.add_argument( "--plot", action="store_true", default=False, help="show a plot results" ) parser.add_argument( "--plot-filename", default=None, help="filename to save the figure to disk" ) args = parser.parse_args() n_samples = args.n_samples n_leaf_nodes = args.n_leaf_nodes n_trees = args.n_trees lr = args.learning_rate max_bins = args.max_bins print("Data size: %d samples train, %d samples test." 
% (n_samples, n_samples))
print(f"n_features: {args.n_features}")


def get_estimator_and_data():
    if args.problem == "classification":
        X, y = make_classification(
            args.n_samples * 2,
            n_features=args.n_features,
            n_classes=args.n_classes,
            n_clusters_per_class=1,
            n_informative=args.n_features // 2,
            random_state=0,
        )
        return X, y, HistGradientBoostingClassifier
    elif args.problem == "regression":
        X, y = make_regression(
            # this script's parser defines --n-samples (not --n-samples-max),
            # so the original args.n_samples_max raised an AttributeError here
            args.n_samples * 2, n_features=args.n_features, random_state=0
        )
        return X, y, HistGradientBoostingRegressor


X, y, Estimator = get_estimator_and_data()
if args.missing_fraction:
    mask = np.random.binomial(1, args.missing_fraction, size=X.shape).astype(bool)
    X[mask] = np.nan

if args.random_sample_weights:
    sample_weight = np.random.rand(len(X)) * 10
else:
    sample_weight = None

if sample_weight is not None:
    (X_train_, X_test_, y_train_, y_test_, sample_weight_train_, _) = train_test_split(
        X, y, sample_weight, test_size=0.5, random_state=0
    )
else:
    X_train_, X_test_, y_train_, y_test_ = train_test_split(
        X, y, test_size=0.5, random_state=0
    )
    sample_weight_train_ = None

sklearn_est = Estimator(
    learning_rate=lr,
    max_iter=n_trees,
    max_bins=max_bins,
    max_leaf_nodes=n_leaf_nodes,
    early_stopping=False,
    random_state=0,
    verbose=0,
)
loss = args.loss
if args.problem == "classification":
    if loss == "default":
        # loss='auto' does not work with get_equivalent_estimator()
        loss = (
            "binary_crossentropy"
            if args.n_classes == 2
            else "categorical_crossentropy"
        )
else:
    # regression
    if loss == "default":
        loss = "squared_error"
sklearn_est.set_params(loss=loss)

if args.print_params:
    print("scikit-learn")
    pprint(sklearn_est.get_params())

    for libname in ["lightgbm", "xgboost", "catboost"]:
        if getattr(args, libname):
            print(libname)
            est = get_equivalent_estimator(
                sklearn_est, lib=libname, n_classes=args.n_classes
            )
            pprint(est.get_params())


def one_run(n_threads, n_samples):
    X_train = X_train_[:n_samples]
    X_test = X_test_[:n_samples]
    y_train = y_train_[:n_samples]
    y_test = y_test_[:n_samples]
    if sample_weight is not None:
        sample_weight_train = sample_weight_train_[:n_samples]
    else:
        sample_weight_train = None
    assert X_train.shape[0] == n_samples
    assert X_test.shape[0] == n_samples

    print("Fitting a sklearn model...")
    tic = time()
    est = sklearn.base.clone(sklearn_est)

    with threadpool_limits(n_threads, user_api="openmp"):
        est.fit(X_train, y_train, sample_weight=sample_weight_train)

    sklearn_fit_duration = time() - tic
    tic = time()
    sklearn_score = est.score(X_test, y_test)
    sklearn_score_duration = time() - tic
    print("score: {:.4f}".format(sklearn_score))
    print("fit duration: {:.3f}s,".format(sklearn_fit_duration))
    print("score duration: {:.3f}s,".format(sklearn_score_duration))

    lightgbm_score = None
    lightgbm_fit_duration = None
    lightgbm_score_duration = None
    if args.lightgbm:
        print("Fitting a LightGBM model...")
        lightgbm_est = get_equivalent_estimator(
            est, lib="lightgbm", n_classes=args.n_classes
        )
        lightgbm_est.set_params(num_threads=n_threads)

        tic = time()
        lightgbm_est.fit(X_train, y_train, sample_weight=sample_weight_train)
        lightgbm_fit_duration = time() - tic
        tic = time()
        lightgbm_score = lightgbm_est.score(X_test, y_test)
        lightgbm_score_duration = time() - tic
        print("score: {:.4f}".format(lightgbm_score))
        print("fit duration: {:.3f}s,".format(lightgbm_fit_duration))
        print("score duration: {:.3f}s,".format(lightgbm_score_duration))

    xgb_score = None
    xgb_fit_duration = None
    xgb_score_duration = None
    if args.xgboost:
        print("Fitting an XGBoost model...")
        xgb_est = get_equivalent_estimator(est, lib="xgboost")
xgb_est.set_params(nthread=n_threads) tic = time() xgb_est.fit(X_train, y_train, sample_weight=sample_weight_train) xgb_fit_duration = time() - tic tic = time() xgb_score = xgb_est.score(X_test, y_test) xgb_score_duration = time() - tic print("score: {:.4f}".format(xgb_score)) print("fit duration: {:.3f}s,".format(xgb_fit_duration)) print("score duration: {:.3f}s,".format(xgb_score_duration)) cat_score = None cat_fit_duration = None cat_score_duration = None if args.catboost: print("Fitting a CatBoost model...") cat_est = get_equivalent_estimator(est, lib="catboost") cat_est.set_params(thread_count=n_threads) tic = time() cat_est.fit(X_train, y_train, sample_weight=sample_weight_train) cat_fit_duration = time() - tic tic = time() cat_score = cat_est.score(X_test, y_test) cat_score_duration = time() - tic print("score: {:.4f}".format(cat_score)) print("fit duration: {:.3f}s,".format(cat_fit_duration)) print("score duration: {:.3f}s,".format(cat_score_duration)) return ( sklearn_score, sklearn_fit_duration, sklearn_score_duration, lightgbm_score, lightgbm_fit_duration, lightgbm_score_duration, xgb_score, xgb_fit_duration, xgb_score_duration, cat_score, cat_fit_duration, cat_score_duration, ) max_threads = os.cpu_count() n_threads_list = [2 ** i for i in range(8) if (2 ** i) < max_threads] n_threads_list.append(max_threads) sklearn_scores = [] sklearn_fit_durations = [] sklearn_score_durations = [] lightgbm_scores = [] lightgbm_fit_durations = [] lightgbm_score_durations = [] xgb_scores = [] xgb_fit_durations = [] xgb_score_durations = [] cat_scores = [] cat_fit_durations = [] cat_score_durations = [] for n_threads in n_threads_list: print(f"n_threads: {n_threads}") ( sklearn_score, sklearn_fit_duration, sklearn_score_duration, lightgbm_score, lightgbm_fit_duration, lightgbm_score_duration, xgb_score, xgb_fit_duration, xgb_score_duration, cat_score, cat_fit_duration, cat_score_duration, ) = one_run(n_threads, n_samples) for scores, score in ( (sklearn_scores, sklearn_score), (sklearn_fit_durations, sklearn_fit_duration), (sklearn_score_durations, sklearn_score_duration), (lightgbm_scores, lightgbm_score), (lightgbm_fit_durations, lightgbm_fit_duration), (lightgbm_score_durations, lightgbm_score_duration), (xgb_scores, xgb_score), (xgb_fit_durations, xgb_fit_duration), (xgb_score_durations, xgb_score_duration), (cat_scores, cat_score), (cat_fit_durations, cat_fit_duration), (cat_score_durations, cat_score_duration), ): scores.append(score) if args.plot or args.plot_filename: import matplotlib.pyplot as plt import matplotlib fig, axs = plt.subplots(2, figsize=(12, 12)) label = f"sklearn {sklearn.__version__}" axs[0].plot(n_threads_list, sklearn_fit_durations, label=label) axs[1].plot(n_threads_list, sklearn_score_durations, label=label) if args.lightgbm: import lightgbm label = f"LightGBM {lightgbm.__version__}" axs[0].plot(n_threads_list, lightgbm_fit_durations, label=label) axs[1].plot(n_threads_list, lightgbm_score_durations, label=label) if args.xgboost: import xgboost label = f"XGBoost {xgboost.__version__}" axs[0].plot(n_threads_list, xgb_fit_durations, label=label) axs[1].plot(n_threads_list, xgb_score_durations, label=label) if args.catboost: import catboost label = f"CatBoost {catboost.__version__}" axs[0].plot(n_threads_list, cat_fit_durations, label=label) axs[1].plot(n_threads_list, cat_score_durations, label=label) for ax in axs: ax.set_xscale("log") ax.set_xlabel("n_threads") ax.set_ylabel("duration (s)") ax.set_ylim(0, None) ax.set_xticks(n_threads_list) 
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) ax.legend(loc="best") axs[0].set_title("fit duration (s)") axs[1].set_title("score duration (s)") title = args.problem if args.problem == "classification": title += " n_classes = {}".format(args.n_classes) fig.suptitle(title) plt.tight_layout() if args.plot_filename: plt.savefig(args.plot_filename) if args.plot: plt.show() ================================================ FILE: benchmarks/bench_isolation_forest.py ================================================ """ ========================================== IsolationForest benchmark ========================================== A test of IsolationForest on classical anomaly detection datasets. The benchmark is run as follows: 1. The dataset is randomly split into a training set and a test set, both assumed to contain outliers. 2. Isolation Forest is trained on the training set. 3. The ROC curve is computed on the test set using the knowledge of the labels. Note that the smtp dataset contains a very small proportion of outliers. Therefore, depending on the seed of the random number generator, randomly splitting the data set might lead to a test set containing no outliers. In this case a warning is raised when computing the ROC curve. """ from time import time import numpy as np import matplotlib.pyplot as plt from sklearn.ensemble import IsolationForest from sklearn.metrics import roc_curve, auc from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml from sklearn.preprocessing import LabelBinarizer from sklearn.utils import shuffle as sh print(__doc__) def print_outlier_ratio(y): """ Helper function to show the distinct value count of element in the target. Useful indicator for the datasets used in bench_isolation_forest.py. 
""" uniq, cnt = np.unique(y, return_counts=True) print("----- Target count values: ") for u, c in zip(uniq, cnt): print("------ %s -> %d occurrences" % (str(u), c)) print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y))) random_state = 1 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5)) # Set this to true for plotting score histograms for each dataset: with_decision_function_histograms = False # datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] datasets = ["http", "smtp", "SA", "SF", "shuttle", "forestcover"] # Loop over all datasets for fitting and scoring the estimator: for dat in datasets: # Loading and vectorizing the data: print("====== %s ======" % dat) print("--- Fetching data...") if dat in ["http", "smtp", "SF", "SA"]: dataset = fetch_kddcup99( subset=dat, shuffle=True, percent10=True, random_state=random_state ) X = dataset.data y = dataset.target if dat == "shuttle": dataset = fetch_openml("shuttle") X = dataset.data y = dataset.target X, y = sh(X, y, random_state=random_state) # we remove data with label 4 # normal data are then those of class 1 s = y != 4 X = X[s, :] y = y[s] y = (y != 1).astype(int) print("----- ") if dat == "forestcover": dataset = fetch_covtype(shuffle=True, random_state=random_state) X = dataset.data y = dataset.target # normal data are those with attribute 2 # abnormal those with attribute 4 s = (y == 2) + (y == 4) X = X[s, :] y = y[s] y = (y != 2).astype(int) print_outlier_ratio(y) print("--- Vectorizing data...") if dat == "SF": lb = LabelBinarizer() x1 = lb.fit_transform(X[:, 1].astype(str)) X = np.c_[X[:, :1], x1, X[:, 2:]] y = (y != b"normal.").astype(int) print_outlier_ratio(y) if dat == "SA": lb = LabelBinarizer() x1 = lb.fit_transform(X[:, 1].astype(str)) x2 = lb.fit_transform(X[:, 2].astype(str)) x3 = lb.fit_transform(X[:, 3].astype(str)) X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] y = (y != b"normal.").astype(int) print_outlier_ratio(y) if dat in ("http", "smtp"): y = (y != b"normal.").astype(int) print_outlier_ratio(y) n_samples, n_features = X.shape n_samples_train = n_samples // 2 X = X.astype(float) X_train = X[:n_samples_train, :] X_test = X[n_samples_train:, :] y_train = y[:n_samples_train] y_test = y[n_samples_train:] print("--- Fitting the IsolationForest estimator...") model = IsolationForest(n_jobs=-1, random_state=random_state) tstart = time() model.fit(X_train) fit_time = time() - tstart tstart = time() scoring = -model.decision_function(X_test) # the lower, the more abnormal print("--- Preparing the plot elements...") if with_decision_function_histograms: fig, ax = plt.subplots(3, sharex=True, sharey=True) bins = np.linspace(-0.5, 0.5, 200) ax[0].hist(scoring, bins, color="black") ax[0].set_title("Decision function for %s dataset" % dat) ax[1].hist(scoring[y_test == 0], bins, color="b", label="normal data") ax[1].legend(loc="lower right") ax[2].hist(scoring[y_test == 1], bins, color="r", label="outliers") ax[2].legend(loc="lower right") # Show ROC Curves predict_time = time() - tstart fpr, tpr, thresholds = roc_curve(y_test, scoring) auc_score = auc(fpr, tpr) label = "%s (AUC: %0.3f, train_time= %0.2fs, test_time= %0.2fs)" % ( dat, auc_score, fit_time, predict_time, ) # Print AUC score and train/test time: print(label) ax_roc.plot(fpr, tpr, lw=1, label=label) ax_roc.set_xlim([-0.05, 1.05]) ax_roc.set_ylim([-0.05, 1.05]) ax_roc.set_xlabel("False Positive Rate") ax_roc.set_ylabel("True Positive Rate") ax_roc.set_title("Receiver operating characteristic (ROC) curves") ax_roc.legend(loc="lower right") 
fig_roc.tight_layout() plt.show() ================================================ FILE: benchmarks/bench_isotonic.py ================================================ """ Benchmarks of isotonic regression performance. We generate a synthetic dataset of size 10^n, for n in [min, max], and examine the time taken to run isotonic regression over the dataset. The timings are then output to stdout, or visualized on a log-log scale with matplotlib. This allows the scaling of the algorithm with the problem size to be visualized and understood. """ import numpy as np import gc from datetime import datetime from sklearn.isotonic import isotonic_regression from scipy.special import expit import matplotlib.pyplot as plt import argparse def generate_perturbed_logarithm_dataset(size): return np.random.randint(-50, 50, size=size) + 50.0 * np.log(1 + np.arange(size)) def generate_logistic_dataset(size): X = np.sort(np.random.normal(size=size)) return np.random.random(size=size) < expit(X) def generate_pathological_dataset(size): # Triggers O(n^2) complexity on the original implementation. return np.r_[ np.arange(size), np.arange(-(size - 1), size), np.arange(-(size - 1), 1) ] DATASET_GENERATORS = { "perturbed_logarithm": generate_perturbed_logarithm_dataset, "logistic": generate_logistic_dataset, "pathological": generate_pathological_dataset, } def bench_isotonic_regression(Y): """ Runs a single iteration of isotonic regression on the input data, and reports the total time taken (in seconds). """ gc.collect() tstart = datetime.now() isotonic_regression(Y) return (datetime.now() - tstart).total_seconds() if __name__ == "__main__": parser = argparse.ArgumentParser(description="Isotonic Regression benchmark tool") parser.add_argument("--seed", type=int, help="RNG seed") parser.add_argument( "--iterations", type=int, required=True, help="Number of iterations to average timings over for each problem size", ) parser.add_argument( "--log_min_problem_size", type=int, required=True, help="Base 10 logarithm of the minimum problem size", ) parser.add_argument( "--log_max_problem_size", type=int, required=True, help="Base 10 logarithm of the maximum problem size", ) parser.add_argument( "--show_plot", action="store_true", help="Plot timing output with matplotlib" ) parser.add_argument("--dataset", choices=DATASET_GENERATORS.keys(), required=True) args = parser.parse_args() np.random.seed(args.seed) timings = [] for exponent in range(args.log_min_problem_size, args.log_max_problem_size): n = 10 ** exponent Y = DATASET_GENERATORS[args.dataset](n) time_per_iteration = [ bench_isotonic_regression(Y) for i in range(args.iterations) ] timing = (n, np.mean(time_per_iteration)) timings.append(timing) # If we're not plotting, dump the timing to stdout if not args.show_plot: print(n, np.mean(time_per_iteration)) if args.show_plot: plt.plot(*zip(*timings)) plt.title("Average time taken running isotonic regression") plt.xlabel("Number of observations") plt.ylabel("Time (s)") plt.axis("tight") plt.loglog() plt.show() ================================================ FILE: benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py ================================================ """ ============================================================= Kernel PCA Solvers comparison benchmark: time vs n_components ============================================================= This benchmark shows that the approximate solvers provided in Kernel PCA can help significantly improve its execution speed when an approximate solution (small 
`n_components`) is acceptable. In many real-world datasets a few hundred principal components are often sufficient to capture the underlying distribution. Description: ------------ A fixed number of training (default: 2000) and test (default: 1000) samples with 2 features is generated using the `make_circles` helper method. KernelPCA models are trained on the training set with an increasing number of principal components, between 1 and `max_n_compo` (default: 1999), with `n_compo_grid_size` positions (default: 10). For each value of `n_components` to try, KernelPCA models are trained for the various possible `eigen_solver` values. The execution times are displayed in a plot at the end of the experiment. What you can observe: --------------------- When the number of requested principal components is small, the dense solver takes more time to complete, while the randomized method returns similar results with shorter execution times. Going further: -------------- You can adjust `max_n_compo` and `n_compo_grid_size` if you wish to explore a different range of values for `n_components`. You can also set `arpack_all=True` to activate the arpack solver for large numbers of components (this takes more time). """ # Authors: Sylvain MARIE, Schneider Electric import time import numpy as np import matplotlib.pyplot as plt from numpy.testing import assert_array_almost_equal from sklearn.decomposition import KernelPCA from sklearn.datasets import make_circles print(__doc__) # 1- Design the Experiment # ------------------------ n_train, n_test = 2000, 1000 # the sample sizes to use max_n_compo = 1999 # max n_components to try n_compo_grid_size = 10 # nb of positions in the grid to try # generate the grid n_compo_range = [ np.round(np.exp((x / (n_compo_grid_size - 1)) * np.log(max_n_compo))) for x in range(0, n_compo_grid_size) ] n_iter = 3 # the number of times each experiment will be repeated arpack_all = False # set to True if you wish to run arpack for all n_compo # 2- Generate random data # ----------------------- n_features = 2 X, y = make_circles( n_samples=(n_train + n_test), factor=0.3, noise=0.05, random_state=0 ) X_train, X_test = X[:n_train, :], X[n_train:, :] # 3- Benchmark # ------------ # init ref_time = np.empty((len(n_compo_range), n_iter)) * np.nan a_time = np.empty((len(n_compo_range), n_iter)) * np.nan r_time = np.empty((len(n_compo_range), n_iter)) * np.nan # loop for j, n_components in enumerate(n_compo_range): n_components = int(n_components) print("Performing kPCA with n_components = %i" % n_components) # A- reference (dense) print(" - dense solver") for i in range(n_iter): start_time = time.perf_counter() ref_pred = ( KernelPCA(n_components, eigen_solver="dense").fit(X_train).transform(X_test) ) ref_time[j, i] = time.perf_counter() - start_time # B- arpack (for small number of components only, too slow otherwise) if arpack_all or n_components < 100: print(" - arpack solver") for i in range(n_iter): start_time = time.perf_counter() a_pred = ( KernelPCA(n_components, eigen_solver="arpack") .fit(X_train) .transform(X_test) ) a_time[j, i] = time.perf_counter() - start_time # check that the result is still correct despite the approx assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred)) # C- randomized print(" - randomized solver") for i in range(n_iter): start_time = time.perf_counter() r_pred = ( KernelPCA(n_components, eigen_solver="randomized") .fit(X_train) .transform(X_test) ) r_time[j, i] = time.perf_counter() - start_time # check that the result is still correct
despite the approximation assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred)) # Compute statistics for the 3 methods avg_ref_time = ref_time.mean(axis=1) std_ref_time = ref_time.std(axis=1) avg_a_time = a_time.mean(axis=1) std_a_time = a_time.std(axis=1) avg_r_time = r_time.mean(axis=1) std_r_time = r_time.std(axis=1) # 4- Plots # -------- fig, ax = plt.subplots(figsize=(12, 8)) # Display 1 plot with error bars per method ax.errorbar( n_compo_range, avg_ref_time, yerr=std_ref_time, marker="x", linestyle="", color="r", label="full", ) ax.errorbar( n_compo_range, avg_a_time, yerr=std_a_time, marker="x", linestyle="", color="g", label="arpack", ) ax.errorbar( n_compo_range, avg_r_time, yerr=std_r_time, marker="x", linestyle="", color="b", label="randomized", ) ax.legend(loc="upper left") # customize axes ax.set_xscale("log") ax.set_xlim(1, max(n_compo_range) * 1.1) ax.set_ylabel("Execution time (s)") ax.set_xlabel("n_components") ax.set_title( "kPCA Execution time comparison on %i samples with %i " "features, according to the choice of `eigen_solver`" "" % (n_train, n_features) ) plt.show() ================================================ FILE: benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py ================================================ """ ========================================================== Kernel PCA Solvers comparison benchmark: time vs n_samples ========================================================== This benchmark shows that the approximate solvers provided in Kernel PCA can help significantly improve its execution speed when an approximate solution (small `n_components`) is acceptable. In many real-world datasets the number of samples is very large, but a few hundred principal components are sufficient to capture the underlying distribution. Description: ------------ An increasing number of examples is used to train a KernelPCA, between `min_n_samples` (default: 101) and `max_n_samples` (default: 4000) with `n_samples_grid_size` positions (default: 4). Samples have 2 features, and are generated using `make_circles`. For each training sample size, KernelPCA models are trained for the various possible `eigen_solver` values. All of them are trained to obtain `n_components` principal components (default: 100). The execution times are displayed in a plot at the end of the experiment. What you can observe: --------------------- When the number of samples provided gets large, the dense solver takes a lot of time to complete, while the randomized method returns similar results in much shorter execution times. Going further: -------------- You can increase `max_n_samples` and `n_samples_grid_size` if you wish to explore a wider range of values for `n_samples`. You can also set `include_arpack=True` to add the arpack solver to the experiments (much slower). Finally you can have a look at the second example of this series, "Kernel PCA Solvers comparison benchmark: time vs n_components", where this time the number of examples is fixed, and the desired number of components varies.
""" # Author: Sylvain MARIE, Schneider Electric import time import numpy as np import matplotlib.pyplot as plt from numpy.testing import assert_array_almost_equal from sklearn.decomposition import KernelPCA from sklearn.datasets import make_circles print(__doc__) # 1- Design the Experiment # ------------------------ min_n_samples, max_n_samples = 101, 4000 # min and max n_samples to try n_samples_grid_size = 4 # nb of positions in the grid to try # generate the grid n_samples_range = [ min_n_samples + np.floor((x / (n_samples_grid_size - 1)) * (max_n_samples - min_n_samples)) for x in range(0, n_samples_grid_size) ] n_components = 100 # the number of principal components we want to use n_iter = 3 # the number of times each experiment will be repeated include_arpack = False # set this to True to include arpack solver (slower) # 2- Generate random data # ----------------------- n_features = 2 X, y = make_circles(n_samples=max_n_samples, factor=0.3, noise=0.05, random_state=0) # 3- Benchmark # ------------ # init ref_time = np.empty((len(n_samples_range), n_iter)) * np.nan a_time = np.empty((len(n_samples_range), n_iter)) * np.nan r_time = np.empty((len(n_samples_range), n_iter)) * np.nan # loop for j, n_samples in enumerate(n_samples_range): n_samples = int(n_samples) print("Performing kPCA with n_samples = %i" % n_samples) X_train = X[:n_samples, :] X_test = X_train # A- reference (dense) print(" - dense") for i in range(n_iter): start_time = time.perf_counter() ref_pred = ( KernelPCA(n_components, eigen_solver="dense").fit(X_train).transform(X_test) ) ref_time[j, i] = time.perf_counter() - start_time # B- arpack if include_arpack: print(" - arpack") for i in range(n_iter): start_time = time.perf_counter() a_pred = ( KernelPCA(n_components, eigen_solver="arpack") .fit(X_train) .transform(X_test) ) a_time[j, i] = time.perf_counter() - start_time # check that the result is still correct despite the approx assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred)) # C- randomized print(" - randomized") for i in range(n_iter): start_time = time.perf_counter() r_pred = ( KernelPCA(n_components, eigen_solver="randomized") .fit(X_train) .transform(X_test) ) r_time[j, i] = time.perf_counter() - start_time # check that the result is still correct despite the approximation assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred)) # Compute statistics for the 3 methods avg_ref_time = ref_time.mean(axis=1) std_ref_time = ref_time.std(axis=1) avg_a_time = a_time.mean(axis=1) std_a_time = a_time.std(axis=1) avg_r_time = r_time.mean(axis=1) std_r_time = r_time.std(axis=1) # 4- Plots # -------- fig, ax = plt.subplots(figsize=(12, 8)) # Display 1 plot with error bars per method ax.errorbar( n_samples_range, avg_ref_time, yerr=std_ref_time, marker="x", linestyle="", color="r", label="full", ) if include_arpack: ax.errorbar( n_samples_range, avg_a_time, yerr=std_a_time, marker="x", linestyle="", color="g", label="arpack", ) ax.errorbar( n_samples_range, avg_r_time, yerr=std_r_time, marker="x", linestyle="", color="b", label="randomized", ) ax.legend(loc="upper left") # customize axes ax.set_xlim(min(n_samples_range) * 0.9, max(n_samples_range) * 1.1) ax.set_ylabel("Execution time (s)") ax.set_xlabel("n_samples") ax.set_title( "Execution time comparison of kPCA with %i components on samples " "with %i features, according to the choice of `eigen_solver`" "" % (n_components, n_features) ) plt.show() ================================================ FILE: benchmarks/bench_lasso.py 
================================================ """ Benchmarks of Lasso vs LassoLars In the first benchmark, we fix the number of features and increase the number of samples. We then plot the computation time as a function of the number of samples. In the second benchmark, we increase the number of dimensions of the training set. We then plot the computation time as a function of the number of dimensions. In both cases, only 10% of the features are informative. """ import gc from time import time import numpy as np from sklearn.datasets import make_regression def compute_bench(alpha, n_samples, n_features, precompute): lasso_results = [] lars_lasso_results = [] it = 0 for ns in n_samples: for nf in n_features: it += 1 print("==================") print("Iteration %s of %s" % (it, max(len(n_samples), len(n_features)))) print("==================") n_informative = nf // 10 X, Y, coef_ = make_regression( n_samples=ns, n_features=nf, n_informative=n_informative, noise=0.1, coef=True, ) X /= np.sqrt(np.sum(X ** 2, axis=0)) # Normalize data gc.collect() print("- benchmarking Lasso") clf = Lasso(alpha=alpha, fit_intercept=False, precompute=precompute) tstart = time() clf.fit(X, Y) lasso_results.append(time() - tstart) gc.collect() print("- benchmarking LassoLars") clf = LassoLars( alpha=alpha, fit_intercept=False, normalize=False, precompute=precompute ) tstart = time() clf.fit(X, Y) lars_lasso_results.append(time() - tstart) return lasso_results, lars_lasso_results if __name__ == "__main__": from sklearn.linear_model import Lasso, LassoLars import matplotlib.pyplot as plt alpha = 0.01 # regularization parameter n_features = 10 list_n_samples = np.linspace(100, 1000000, 5).astype(int) lasso_results, lars_lasso_results = compute_bench( alpha, list_n_samples, [n_features], precompute=True ) plt.figure("scikit-learn LASSO benchmark results") plt.subplot(211) plt.plot(list_n_samples, lasso_results, "b-", label="Lasso") plt.plot(list_n_samples, lars_lasso_results, "r-", label="LassoLars") plt.title("precomputed Gram matrix, %d features, alpha=%s" % (n_features, alpha)) plt.legend(loc="upper left") plt.xlabel("number of samples") plt.ylabel("Time (s)") plt.axis("tight") n_samples = 2000 list_n_features = np.linspace(500, 3000, 5).astype(int) lasso_results, lars_lasso_results = compute_bench( alpha, [n_samples], list_n_features, precompute=False ) plt.subplot(212) plt.plot(list_n_features, lasso_results, "b-", label="Lasso") plt.plot(list_n_features, lars_lasso_results, "r-", label="LassoLars") plt.title("%d samples, alpha=%s" % (n_samples, alpha)) plt.legend(loc="upper left") plt.xlabel("number of features") plt.ylabel("Time (s)") plt.axis("tight") plt.show() ================================================ FILE: benchmarks/bench_lof.py ================================================ """ ============================ LocalOutlierFactor benchmark ============================ A test of LocalOutlierFactor on classical anomaly detection datasets. Note that LocalOutlierFactor is not meant to predict on a test set and its performance is assessed in an outlier detection context: 1. The model is trained on the whole dataset which is assumed to contain outliers. 2. The ROC curve is computed on the same dataset using the knowledge of the labels. In this context there is no need to shuffle the dataset because the model is trained and tested on the whole dataset. The randomness of this benchmark is only caused by the random selection of anomalies in the SA dataset.
""" from time import time import numpy as np import matplotlib.pyplot as plt from sklearn.neighbors import LocalOutlierFactor from sklearn.metrics import roc_curve, auc from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml from sklearn.preprocessing import LabelBinarizer print(__doc__) random_state = 2 # to control the random selection of anomalies in SA # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] datasets = ["http", "smtp", "SA", "SF", "shuttle", "forestcover"] plt.figure() for dataset_name in datasets: # loading and vectorization print("loading data") if dataset_name in ["http", "smtp", "SA", "SF"]: dataset = fetch_kddcup99( subset=dataset_name, percent10=True, random_state=random_state ) X = dataset.data y = dataset.target if dataset_name == "shuttle": dataset = fetch_openml("shuttle") X = dataset.data y = dataset.target # we remove data with label 4 # normal data are then those of class 1 s = y != 4 X = X[s, :] y = y[s] y = (y != 1).astype(int) if dataset_name == "forestcover": dataset = fetch_covtype() X = dataset.data y = dataset.target # normal data are those with attribute 2 # abnormal those with attribute 4 s = (y == 2) + (y == 4) X = X[s, :] y = y[s] y = (y != 2).astype(int) print("vectorizing data") if dataset_name == "SF": lb = LabelBinarizer() x1 = lb.fit_transform(X[:, 1].astype(str)) X = np.c_[X[:, :1], x1, X[:, 2:]] y = (y != b"normal.").astype(int) if dataset_name == "SA": lb = LabelBinarizer() x1 = lb.fit_transform(X[:, 1].astype(str)) x2 = lb.fit_transform(X[:, 2].astype(str)) x3 = lb.fit_transform(X[:, 3].astype(str)) X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] y = (y != b"normal.").astype(int) if dataset_name == "http" or dataset_name == "smtp": y = (y != b"normal.").astype(int) X = X.astype(float) print("LocalOutlierFactor processing...") model = LocalOutlierFactor(n_neighbors=20) tstart = time() model.fit(X) fit_time = time() - tstart scoring = -model.negative_outlier_factor_ # the lower, the more normal fpr, tpr, thresholds = roc_curve(y, scoring) AUC = auc(fpr, tpr) plt.plot( fpr, tpr, lw=1, label="ROC for %s (area = %0.3f, train-time: %0.2fs)" % (dataset_name, AUC, fit_time), ) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel("False Positive Rate") plt.ylabel("True Positive Rate") plt.title("Receiver operating characteristic") plt.legend(loc="lower right") plt.show() ================================================ FILE: benchmarks/bench_mnist.py ================================================ """ ======================= MNIST dataset benchmark ======================= Benchmark on the MNIST dataset. The dataset comprises 70,000 samples and 784 features. Here, we consider the task of predicting 10 classes - digits from 0 to 9 from their raw images. By contrast to the covertype dataset, the feature space is homogeneous. Example of output : [..] Classification performance: =========================== Classifier train-time test-time error-rate ------------------------------------------------------------ MLP_adam 53.46s 0.11s 0.0224 Nystroem-SVM 112.97s 0.92s 0.0228 MultilayerPerceptron 24.33s 0.14s 0.0287 ExtraTrees 42.99s 0.57s 0.0294 RandomForest 42.70s 0.49s 0.0318 SampledRBF-SVM 135.81s 0.56s 0.0486 LinearRegression-SAG 16.67s 0.06s 0.0824 CART 20.69s 0.02s 0.1219 dummy 0.00s 0.01s 0.8973 """ # Author: Issam H. 
Laradji # Arnaud Joly # License: BSD 3 clause import os from time import time import argparse import numpy as np from joblib import Memory from sklearn.datasets import fetch_openml from sklearn.datasets import get_data_home from sklearn.ensemble import ExtraTreesClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.dummy import DummyClassifier from sklearn.kernel_approximation import Nystroem from sklearn.kernel_approximation import RBFSampler from sklearn.metrics import zero_one_loss from sklearn.pipeline import make_pipeline from sklearn.svm import LinearSVC from sklearn.tree import DecisionTreeClassifier from sklearn.utils import check_array from sklearn.linear_model import LogisticRegression from sklearn.neural_network import MLPClassifier # Memoize the data extraction and memory map the resulting # train / test splits in readonly mode memory = Memory(os.path.join(get_data_home(), "mnist_benchmark_data"), mmap_mode="r") @memory.cache def load_data(dtype=np.float32, order="F"): """Load the data, then cache and memmap the train/test split""" ###################################################################### # Load dataset print("Loading dataset...") data = fetch_openml("mnist_784") X = check_array(data["data"], dtype=dtype, order=order) y = data["target"] # Normalize features X = X / 255 # Create train-test split (as [Joachims, 2006]) print("Creating train-test split...") n_train = 60000 X_train = X[:n_train] y_train = y[:n_train] X_test = X[n_train:] y_test = y[n_train:] return X_train, X_test, y_train, y_test ESTIMATORS = { "dummy": DummyClassifier(), "CART": DecisionTreeClassifier(), "ExtraTrees": ExtraTreesClassifier(), "RandomForest": RandomForestClassifier(), "Nystroem-SVM": make_pipeline( Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100) ), "SampledRBF-SVM": make_pipeline( RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100) ), "LogisticRegression-SAG": LogisticRegression(solver="sag", tol=1e-1, C=1e4), "LogisticRegression-SAGA": LogisticRegression(solver="saga", tol=1e-1, C=1e4), "MultilayerPerceptron": MLPClassifier( hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, solver="sgd", learning_rate_init=0.2, momentum=0.9, verbose=1, tol=1e-4, random_state=1, ), "MLP-adam": MLPClassifier( hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, solver="adam", learning_rate_init=0.001, verbose=1, tol=1e-4, random_state=1, ), } if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--classifiers", nargs="+", choices=ESTIMATORS, type=str, default=["ExtraTrees", "Nystroem-SVM"], help="list of classifiers to benchmark.", ) parser.add_argument( "--n-jobs", nargs="?", default=1, type=int, help=( "Number of concurrently running workers for " "models that support parallelism." 
), ) parser.add_argument( "--order", nargs="?", default="C", type=str, choices=["F", "C"], help="Allow to choose between fortran and C ordered data", ) parser.add_argument( "--random-seed", nargs="?", default=0, type=int, help="Common seed used by random number generator.", ) args = vars(parser.parse_args()) print(__doc__) X_train, X_test, y_train, y_test = load_data(order=args["order"]) print("") print("Dataset statistics:") print("===================") print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) print("%s %s" % ("data type:".ljust(25), X_train.dtype)) print( "%s %d (size=%dMB)" % ( "number of train samples:".ljust(25), X_train.shape[0], int(X_train.nbytes / 1e6), ) ) print( "%s %d (size=%dMB)" % ( "number of test samples:".ljust(25), X_test.shape[0], int(X_test.nbytes / 1e6), ) ) print() print("Training Classifiers") print("====================") error, train_time, test_time = {}, {}, {} for name in sorted(args["classifiers"]): print("Training %s ... " % name, end="") estimator = ESTIMATORS[name] estimator_params = estimator.get_params() estimator.set_params( **{ p: args["random_seed"] for p in estimator_params if p.endswith("random_state") } ) if "n_jobs" in estimator_params: estimator.set_params(n_jobs=args["n_jobs"]) time_start = time() estimator.fit(X_train, y_train) train_time[name] = time() - time_start time_start = time() y_pred = estimator.predict(X_test) test_time[name] = time() - time_start error[name] = zero_one_loss(y_test, y_pred) print("done") print() print("Classification performance:") print("===========================") print( "{0: <24} {1: >10} {2: >11} {3: >12}".format( "Classifier ", "train-time", "test-time", "error-rate" ) ) print("-" * 60) for name in sorted(args["classifiers"], key=error.get): print( "{0: <23} {1: >10.2f}s {2: >10.2f}s {3: >12.4f}".format( name, train_time[name], test_time[name], error[name] ) ) print() ================================================ FILE: benchmarks/bench_multilabel_metrics.py ================================================ #!/usr/bin/env python """ A comparison of multilabel target formats and metrics over them """ from timeit import timeit from functools import partial import itertools import argparse import sys import matplotlib.pyplot as plt import scipy.sparse as sp import numpy as np from sklearn.datasets import make_multilabel_classification from sklearn.metrics import ( f1_score, accuracy_score, hamming_loss, jaccard_similarity_score, ) from sklearn.utils._testing import ignore_warnings METRICS = { "f1": partial(f1_score, average="micro"), "f1-by-sample": partial(f1_score, average="samples"), "accuracy": accuracy_score, "hamming": hamming_loss, "jaccard": jaccard_similarity_score, } FORMATS = { "sequences": lambda y: [list(np.flatnonzero(s)) for s in y], "dense": lambda y: y, "csr": lambda y: sp.csr_matrix(y), "csc": lambda y: sp.csc_matrix(y), } @ignore_warnings def benchmark( metrics=tuple(v for k, v in sorted(METRICS.items())), formats=tuple(v for k, v in sorted(FORMATS.items())), samples=1000, classes=4, density=0.2, n_times=5, ): """Times metric calculations for a number of inputs Parameters ---------- metrics : array-like of callables (1d or 0d) The metric functions to time. formats : array-like of callables (1d or 0d) These may transform a dense indicator matrix into multilabel representation. samples : array-like of ints (1d or 0d) The number of samples to generate as input. 
classes : array-like of ints (1d or 0d) The number of classes in the input. density : array-like of ints (1d or 0d) The density of positive labels in the input. n_times : int Time calling the metric n_times times. Returns ------- array of floats shaped like (metrics, formats, samples, classes, density) Time in seconds. """ metrics = np.atleast_1d(metrics) samples = np.atleast_1d(samples) classes = np.atleast_1d(classes) density = np.atleast_1d(density) formats = np.atleast_1d(formats) out = np.zeros( (len(metrics), len(formats), len(samples), len(classes), len(density)), dtype=float, ) it = itertools.product(samples, classes, density) for i, (s, c, d) in enumerate(it): _, y_true = make_multilabel_classification( n_samples=s, n_features=1, n_classes=c, n_labels=d * c, random_state=42 ) _, y_pred = make_multilabel_classification( n_samples=s, n_features=1, n_classes=c, n_labels=d * c, random_state=84 ) for j, f in enumerate(formats): f_true = f(y_true) f_pred = f(y_pred) for k, metric in enumerate(metrics): t = timeit(partial(metric, f_true, f_pred), number=n_times) out[k, j].flat[i] = t return out def _tabulate(results, metrics, formats): """Prints results by metric and format Uses the last ([-1]) value of other fields """ column_width = max(max(len(k) for k in formats) + 1, 8) first_width = max(len(k) for k in metrics) head_fmt = "{:<{fw}s}" + "{:>{cw}s}" * len(formats) row_fmt = "{:<{fw}s}" + "{:>{cw}.3f}" * len(formats) print(head_fmt.format("Metric", *formats, cw=column_width, fw=first_width)) for metric, row in zip(metrics, results[:, :, -1, -1, -1]): print(row_fmt.format(metric, *row, cw=column_width, fw=first_width)) def _plot( results, metrics, formats, title, x_ticks, x_label, format_markers=("x", "|", "o", "+"), metric_colors=("c", "m", "y", "k", "g", "r", "b"), ): """ Plot the results by metric, format and some other variable given by x_label """ fig = plt.figure("scikit-learn multilabel metrics benchmarks") plt.title(title) ax = fig.add_subplot(111) for i, metric in enumerate(metrics): for j, format in enumerate(formats): ax.plot( x_ticks, results[i, j].flat, label="{}, {}".format(metric, format), marker=format_markers[j], color=metric_colors[i % len(metric_colors)], ) ax.set_xlabel(x_label) ax.set_ylabel("Time (s)") ax.legend() plt.show() if __name__ == "__main__": ap = argparse.ArgumentParser() ap.add_argument( "metrics", nargs="*", default=sorted(METRICS), help="Specifies metrics to benchmark, defaults to all. 
Choices are: {}".format( sorted(METRICS) ), ) ap.add_argument( "--formats", nargs="+", choices=sorted(FORMATS), help="Specifies multilabel formats to benchmark (defaults to all).", ) ap.add_argument( "--samples", type=int, default=1000, help="The number of samples to generate" ) ap.add_argument("--classes", type=int, default=10, help="The number of classes") ap.add_argument( "--density", type=float, default=0.2, help="The average density of labels per sample", ) ap.add_argument( "--plot", choices=["classes", "density", "samples"], default=None, help=( "Plot time with respect to this parameter varying up to the specified value" ), ) ap.add_argument( "--n-steps", default=10, type=int, help="Plot this many points for each metric" ) ap.add_argument( "--n-times", default=5, type=int, help="Time performance over n_times trials" ) args = ap.parse_args() if args.plot is not None: max_val = getattr(args, args.plot) if args.plot in ("classes", "samples"): min_val = 2 else: min_val = 0 steps = np.linspace(min_val, max_val, num=args.n_steps + 1)[1:] if args.plot in ("classes", "samples"): steps = np.unique(np.round(steps).astype(int)) setattr(args, args.plot, steps) if args.metrics is None: args.metrics = sorted(METRICS) if args.formats is None: args.formats = sorted(FORMATS) results = benchmark( [METRICS[k] for k in args.metrics], [FORMATS[k] for k in args.formats], args.samples, args.classes, args.density, args.n_times, ) _tabulate(results, args.metrics, args.formats) if args.plot is not None: print("Displaying plot", file=sys.stderr) title = "Multilabel metrics with %s" % ", ".join( "{0}={1}".format(field, getattr(args, field)) for field in ["samples", "classes", "density"] if args.plot != field ) _plot(results, args.metrics, args.formats, title, steps, args.plot) ================================================ FILE: benchmarks/bench_online_ocsvm.py ================================================ """ ===================================== SGDOneClassSVM benchmark ===================================== This benchmark compares the :class:`SGDOneClassSVM` with :class:`OneClassSVM`. The former is an online One-Class SVM implemented with a Stochastic Gradient Descent (SGD). The latter is based on the LibSVM implementation. The complexity of :class:`SGDOneClassSVM` is linear in the number of samples whereas the one of :class:`OneClassSVM` is at best quadratic in the number of samples. We here compare the performance in terms of AUC and training time on classical anomaly detection datasets. The :class:`OneClassSVM` is applied with a Gaussian kernel and we therefore use a kernel approximation prior to the application of :class:`SGDOneClassSVM`. """ from time import time import numpy as np from scipy.interpolate import interp1d from sklearn.metrics import roc_curve, auc from sklearn.datasets import fetch_kddcup99, fetch_covtype from sklearn.preprocessing import LabelBinarizer, StandardScaler from sklearn.pipeline import make_pipeline from sklearn.utils import shuffle from sklearn.kernel_approximation import Nystroem from sklearn.svm import OneClassSVM from sklearn.linear_model import SGDOneClassSVM import matplotlib.pyplot as plt import matplotlib font = {"weight": "normal", "size": 15} matplotlib.rc("font", **font) print(__doc__) def print_outlier_ratio(y): """ Helper function to show the distinct value count of element in the target. Useful indicator for the datasets used in bench_isolation_forest.py. 
""" uniq, cnt = np.unique(y, return_counts=True) print("----- Target count values: ") for u, c in zip(uniq, cnt): print("------ %s -> %d occurrences" % (str(u), c)) print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y))) # for roc curve computation n_axis = 1000 x_axis = np.linspace(0, 1, n_axis) datasets = ["http", "smtp", "SA", "SF", "forestcover"] novelty_detection = False # if False, training set polluted by outliers random_states = [42] nu = 0.05 results_libsvm = np.empty((len(datasets), n_axis + 5)) results_online = np.empty((len(datasets), n_axis + 5)) for dat, dataset_name in enumerate(datasets): print(dataset_name) # Loading datasets if dataset_name in ["http", "smtp", "SA", "SF"]: dataset = fetch_kddcup99( subset=dataset_name, shuffle=False, percent10=False, random_state=88 ) X = dataset.data y = dataset.target if dataset_name == "forestcover": dataset = fetch_covtype(shuffle=False) X = dataset.data y = dataset.target # normal data are those with attribute 2 # abnormal those with attribute 4 s = (y == 2) + (y == 4) X = X[s, :] y = y[s] y = (y != 2).astype(int) # Vectorizing data if dataset_name == "SF": # Casting type of X (object) as string is needed for string categorical # features to apply LabelBinarizer lb = LabelBinarizer() x1 = lb.fit_transform(X[:, 1].astype(str)) X = np.c_[X[:, :1], x1, X[:, 2:]] y = (y != b"normal.").astype(int) if dataset_name == "SA": lb = LabelBinarizer() # Casting type of X (object) as string is needed for string categorical # features to apply LabelBinarizer x1 = lb.fit_transform(X[:, 1].astype(str)) x2 = lb.fit_transform(X[:, 2].astype(str)) x3 = lb.fit_transform(X[:, 3].astype(str)) X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] y = (y != b"normal.").astype(int) if dataset_name in ["http", "smtp"]: y = (y != b"normal.").astype(int) print_outlier_ratio(y) n_samples, n_features = np.shape(X) if dataset_name == "SA": # LibSVM too long with n_samples // 2 n_samples_train = n_samples // 20 else: n_samples_train = n_samples // 2 n_samples_test = n_samples - n_samples_train print("n_train: ", n_samples_train) print("n_features: ", n_features) tpr_libsvm = np.zeros(n_axis) tpr_online = np.zeros(n_axis) fit_time_libsvm = 0 fit_time_online = 0 predict_time_libsvm = 0 predict_time_online = 0 X = X.astype(float) gamma = 1 / n_features # OCSVM default parameter for random_state in random_states: print("random state: %s" % random_state) X, y = shuffle(X, y, random_state=random_state) X_train = X[:n_samples_train] X_test = X[n_samples_train:] y_train = y[:n_samples_train] y_test = y[n_samples_train:] if novelty_detection: X_train = X_train[y_train == 0] y_train = y_train[y_train == 0] std = StandardScaler() print("----------- LibSVM OCSVM ------------") ocsvm = OneClassSVM(kernel="rbf", gamma=gamma, nu=nu) pipe_libsvm = make_pipeline(std, ocsvm) tstart = time() pipe_libsvm.fit(X_train) fit_time_libsvm += time() - tstart tstart = time() # scoring such that the lower, the more normal scoring = -pipe_libsvm.decision_function(X_test) predict_time_libsvm += time() - tstart fpr_libsvm_, tpr_libsvm_, _ = roc_curve(y_test, scoring) f_libsvm = interp1d(fpr_libsvm_, tpr_libsvm_) tpr_libsvm += f_libsvm(x_axis) print("----------- Online OCSVM ------------") nystroem = Nystroem(gamma=gamma, random_state=random_state) online_ocsvm = SGDOneClassSVM(nu=nu, random_state=random_state) pipe_online = make_pipeline(std, nystroem, online_ocsvm) tstart = time() pipe_online.fit(X_train) fit_time_online += time() - tstart tstart = time() # scoring such that the lower, the more normal 
scoring = -pipe_online.decision_function(X_test) predict_time_online += time() - tstart fpr_online_, tpr_online_, _ = roc_curve(y_test, scoring) f_online = interp1d(fpr_online_, tpr_online_) tpr_online += f_online(x_axis) tpr_libsvm /= len(random_states) tpr_libsvm[0] = 0.0 fit_time_libsvm /= len(random_states) predict_time_libsvm /= len(random_states) auc_libsvm = auc(x_axis, tpr_libsvm) results_libsvm[dat] = [ fit_time_libsvm, predict_time_libsvm, auc_libsvm, n_samples_train, n_features, ] + list(tpr_libsvm) tpr_online /= len(random_states) tpr_online[0] = 0.0 fit_time_online /= len(random_states) predict_time_online /= len(random_states) auc_online = auc(x_axis, tpr_online) results_online[dat] = [ fit_time_online, predict_time_online, auc_online, n_samples_train, n_features, ] + list(tpr_online) # -------- Plotting bar charts ------------- fit_time_libsvm_all = results_libsvm[:, 0] predict_time_libsvm_all = results_libsvm[:, 1] auc_libsvm_all = results_libsvm[:, 2] n_train_all = results_libsvm[:, 3] n_features_all = results_libsvm[:, 4] fit_time_online_all = results_online[:, 0] predict_time_online_all = results_online[:, 1] auc_online_all = results_online[:, 2] width = 0.7 ind = 2 * np.arange(len(datasets)) x_tickslabels = [ (name + "\n" + r"$n={:,d}$" + "\n" + r"$d={:d}$").format(int(n), int(d)) for name, n, d in zip(datasets, n_train_all, n_features_all) ] def autolabel_auc(rects, ax): """Attach a text label above each bar displaying its height.""" for rect in rects: height = rect.get_height() ax.text( rect.get_x() + rect.get_width() / 2.0, 1.05 * height, "%.3f" % height, ha="center", va="bottom", ) def autolabel_time(rects, ax): """Attach a text label above each bar displaying its height.""" for rect in rects: height = rect.get_height() ax.text( rect.get_x() + rect.get_width() / 2.0, 1.05 * height, "%.1f" % height, ha="center", va="bottom", ) fig, ax = plt.subplots(figsize=(15, 8)) ax.set_ylabel("AUC") ax.set_ylim((0, 1.3)) rect_libsvm = ax.bar(ind, auc_libsvm_all, width=width, color="r") rect_online = ax.bar(ind + width, auc_online_all, width=width, color="y") ax.legend((rect_libsvm[0], rect_online[0]), ("LibSVM", "Online SVM")) ax.set_xticks(ind + width / 2) ax.set_xticklabels(x_tickslabels) autolabel_auc(rect_libsvm, ax) autolabel_auc(rect_online, ax) plt.show() fig, ax = plt.subplots(figsize=(15, 8)) ax.set_ylabel("Training time (sec) - Log scale") ax.set_yscale("log") rect_libsvm = ax.bar(ind, fit_time_libsvm_all, color="r", width=width) rect_online = ax.bar(ind + width, fit_time_online_all, color="y", width=width) ax.legend((rect_libsvm[0], rect_online[0]), ("LibSVM", "Online SVM")) ax.set_xticks(ind + width / 2) ax.set_xticklabels(x_tickslabels) autolabel_time(rect_libsvm, ax) autolabel_time(rect_online, ax) plt.show() fig, ax = plt.subplots(figsize=(15, 8)) ax.set_ylabel("Testing time (sec) - Log scale") ax.set_yscale("log") rect_libsvm = ax.bar(ind, predict_time_libsvm_all, color="r", width=width) rect_online = ax.bar(ind + width, predict_time_online_all, color="y", width=width) ax.legend((rect_libsvm[0], rect_online[0]), ("LibSVM", "Online SVM")) ax.set_xticks(ind + width / 2) ax.set_xticklabels(x_tickslabels) autolabel_time(rect_libsvm, ax) autolabel_time(rect_online, ax) plt.show() ================================================ FILE: benchmarks/bench_plot_fastkmeans.py ================================================ from collections import defaultdict from time import time import numpy as np from numpy import random as nr from sklearn.cluster import KMeans,
MiniBatchKMeans def compute_bench(samples_range, features_range): it = 0 results = defaultdict(lambda: []) chunk = 100 max_it = len(samples_range) * len(features_range) for n_samples in samples_range: for n_features in features_range: it += 1 print("==============================") print("Iteration %03d of %03d" % (it, max_it)) print("==============================") print() data = nr.randint(-50, 51, (n_samples, n_features)) print("K-Means") tstart = time() kmeans = KMeans(init="k-means++", n_clusters=10).fit(data) delta = time() - tstart print("Speed: %0.3fs" % delta) print("Inertia: %0.5f" % kmeans.inertia_) print() results["kmeans_speed"].append(delta) results["kmeans_quality"].append(kmeans.inertia_) print("Fast K-Means") # let's prepare the data in small chunks mbkmeans = MiniBatchKMeans( init="k-means++", n_clusters=10, batch_size=chunk ) tstart = time() mbkmeans.fit(data) delta = time() - tstart print("Speed: %0.3fs" % delta) print("Inertia: %f" % mbkmeans.inertia_) print() print() results["MiniBatchKMeans Speed"].append(delta) results["MiniBatchKMeans Quality"].append(mbkmeans.inertia_) return results def compute_bench_2(chunks): results = defaultdict(lambda: []) n_features = 50000 means = np.array( [ [1, 1], [-1, -1], [1, -1], [-1, 1], [0.5, 0.5], [0.75, -0.5], [-1, 0.75], [1, 0], ] ) X = np.empty((0, 2)) for i in range(8): X = np.r_[X, means[i] + 0.8 * np.random.randn(n_features, 2)] max_it = len(chunks) it = 0 for chunk in chunks: it += 1 print("==============================") print("Iteration %03d of %03d" % (it, max_it)) print("==============================") print() print("Fast K-Means") tstart = time() mbkmeans = MiniBatchKMeans(init="k-means++", n_clusters=8, batch_size=chunk) mbkmeans.fit(X) delta = time() - tstart print("Speed: %0.3fs" % delta) print("Inertia: %0.3fs" % mbkmeans.inertia_) print() results["MiniBatchKMeans Speed"].append(delta) results["MiniBatchKMeans Quality"].append(mbkmeans.inertia_) return results if __name__ == "__main__": from mpl_toolkits.mplot3d import axes3d # noqa register the 3d projection import matplotlib.pyplot as plt samples_range = np.linspace(50, 150, 5).astype(int) features_range = np.linspace(150, 50000, 5).astype(int) chunks = np.linspace(500, 10000, 15).astype(int) results = compute_bench(samples_range, features_range) results_2 = compute_bench_2(chunks) max_time = max( [max(i) for i in [t for (label, t) in results.items() if "speed" in label]] ) max_inertia = max( [max(i) for i in [t for (label, t) in results.items() if "speed" not in label]] ) fig = plt.figure("scikit-learn K-Means benchmark results") for c, (label, timings) in zip("brcy", sorted(results.items())): if "speed" in label: ax = fig.add_subplot(2, 2, 1, projection="3d") ax.set_zlim3d(0.0, max_time * 1.1) else: ax = fig.add_subplot(2, 2, 2, projection="3d") ax.set_zlim3d(0.0, max_inertia * 1.1) X, Y = np.meshgrid(samples_range, features_range) Z = np.asarray(timings).reshape(samples_range.shape[0], features_range.shape[0]) ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.5) ax.set_xlabel("n_samples") ax.set_ylabel("n_features") i = 0 for c, (label, timings) in zip("br", sorted(results_2.items())): i += 1 ax = fig.add_subplot(2, 2, i + 2) y = np.asarray(timings) ax.plot(chunks, y, color=c, alpha=0.8) ax.set_xlabel("Chunks") ax.set_ylabel(label) plt.show() ================================================ FILE: benchmarks/bench_plot_hierarchical.py ================================================ from collections import defaultdict from time import time 
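# NOTE (added comment): this benchmark times AgglomerativeClustering with each of
# its four linkage strategies ("single", "average", "complete", "ward") on random
# integer data, over a grid of sample and feature sizes; hierarchical clustering
# scales super-linearly in n_samples, which the per-linkage plots below visualize.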
import numpy as np from numpy import random as nr from sklearn.cluster import AgglomerativeClustering def compute_bench(samples_range, features_range): it = 0 results = defaultdict(lambda: []) max_it = len(samples_range) * len(features_range) for n_samples in samples_range: for n_features in features_range: it += 1 print("==============================") print("Iteration %03d of %03d" % (it, max_it)) print("n_samples %05d; n_features %02d" % (n_samples, n_features)) print("==============================") print() data = nr.randint(-50, 51, (n_samples, n_features)) for linkage in ("single", "average", "complete", "ward"): print(linkage.capitalize()) tstart = time() AgglomerativeClustering(linkage=linkage, n_clusters=10).fit(data) delta = time() - tstart print("Speed: %0.3fs" % delta) print() results[linkage].append(delta) return results if __name__ == "__main__": import matplotlib.pyplot as plt samples_range = np.linspace(1000, 15000, 8).astype(int) features_range = np.array([2, 10, 20, 50]) results = compute_bench(samples_range, features_range) max_time = max([max(i) for i in [t for (label, t) in results.items()]]) colors = plt.get_cmap("tab10")(np.linspace(0, 1, 10))[:4] lines = {linkage: None for linkage in results.keys()} fig, axs = plt.subplots(2, 2, sharex=True, sharey=True) fig.suptitle("Scikit-learn agglomerative clustering benchmark results", fontsize=16) for c, (label, timings) in zip(colors, sorted(results.items())): timing_by_samples = np.asarray(timings).reshape( samples_range.shape[0], features_range.shape[0] ) for n in range(timing_by_samples.shape[1]): ax = axs.flatten()[n] (lines[label],) = ax.plot( samples_range, timing_by_samples[:, n], color=c, label=label ) ax.set_title("n_features = %d" % features_range[n]) if n >= 2: ax.set_xlabel("n_samples") if n % 2 == 0: ax.set_ylabel("time (s)") fig.subplots_adjust(right=0.8) fig.legend( [lines[link] for link in sorted(results.keys())], sorted(results.keys()), loc="center right", fontsize=8, ) plt.show() ================================================ FILE: benchmarks/bench_plot_incremental_pca.py ================================================ """ ======================== IncrementalPCA benchmark ======================== Benchmarks for IncrementalPCA """ import numpy as np import gc from time import time from collections import defaultdict import matplotlib.pyplot as plt from sklearn.datasets import fetch_lfw_people from sklearn.decomposition import IncrementalPCA, PCA def plot_results(X, y, label): plt.plot(X, y, label=label, marker="o") def benchmark(estimator, data): gc.collect() print("Benching %s" % estimator) t0 = time() estimator.fit(data) training_time = time() - t0 data_t = estimator.transform(data) data_r = estimator.inverse_transform(data_t) reconstruction_error = np.mean(np.abs(data - data_r)) return {"time": training_time, "error": reconstruction_error} def plot_feature_times(all_times, batch_size, all_components, data): plt.figure() plot_results(all_components, all_times["pca"], label="PCA") plot_results( all_components, all_times["ipca"], label="IncrementalPCA, bsize=%i" % batch_size ) plt.legend(loc="upper left") plt.suptitle( "Algorithm runtime vs. 
n_components\n LFW, size %i x %i" % data.shape ) plt.xlabel("Number of components (out of max %i)" % data.shape[1]) plt.ylabel("Time (seconds)") def plot_feature_errors(all_errors, batch_size, all_components, data): plt.figure() plot_results(all_components, all_errors["pca"], label="PCA") plot_results( all_components, all_errors["ipca"], label="IncrementalPCA, bsize=%i" % batch_size, ) plt.legend(loc="lower left") plt.suptitle("Algorithm error vs. n_components\nLFW, size %i x %i" % data.shape) plt.xlabel("Number of components (out of max %i)" % data.shape[1]) plt.ylabel("Mean absolute error") def plot_batch_times(all_times, n_features, all_batch_sizes, data): plt.figure() plot_results(all_batch_sizes, all_times["pca"], label="PCA") plot_results(all_batch_sizes, all_times["ipca"], label="IncrementalPCA") plt.legend(loc="lower left") plt.suptitle( "Algorithm runtime vs. batch_size for n_components %i\n LFW," " size %i x %i" % (n_features, data.shape[0], data.shape[1]) ) plt.xlabel("Batch size") plt.ylabel("Time (seconds)") def plot_batch_errors(all_errors, n_features, all_batch_sizes, data): plt.figure() plot_results(all_batch_sizes, all_errors["pca"], label="PCA") plot_results(all_batch_sizes, all_errors["ipca"], label="IncrementalPCA") plt.legend(loc="lower left") plt.suptitle( "Algorithm error vs. batch_size for n_components %i\n LFW," " size %i x %i" % (n_features, data.shape[0], data.shape[1]) ) plt.xlabel("Batch size") plt.ylabel("Mean absolute error") def fixed_batch_size_comparison(data): all_features = [ i.astype(int) for i in np.linspace(data.shape[1] // 10, data.shape[1], num=5) ] batch_size = 1000 # Compare runtimes and error for fixed batch size all_times = defaultdict(list) all_errors = defaultdict(list) for n_components in all_features: pca = PCA(n_components=n_components) ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size) results_dict = { k: benchmark(est, data) for k, est in [("pca", pca), ("ipca", ipca)] } for k in sorted(results_dict.keys()): all_times[k].append(results_dict[k]["time"]) all_errors[k].append(results_dict[k]["error"]) plot_feature_times(all_times, batch_size, all_features, data) plot_feature_errors(all_errors, batch_size, all_features, data) def variable_batch_size_comparison(data): batch_sizes = [ i.astype(int) for i in np.linspace(data.shape[0] // 10, data.shape[0], num=10) ] for n_components in [ i.astype(int) for i in np.linspace(data.shape[1] // 10, data.shape[1], num=4) ]: all_times = defaultdict(list) all_errors = defaultdict(list) pca = PCA(n_components=n_components) rpca = PCA( n_components=n_components, svd_solver="randomized", random_state=1999 ) results_dict = { k: benchmark(est, data) for k, est in [("pca", pca), ("rpca", rpca)] } # Create flat baselines to compare the variation over batch size all_times["pca"].extend([results_dict["pca"]["time"]] * len(batch_sizes)) all_errors["pca"].extend([results_dict["pca"]["error"]] * len(batch_sizes)) all_times["rpca"].extend([results_dict["rpca"]["time"]] * len(batch_sizes)) all_errors["rpca"].extend([results_dict["rpca"]["error"]] * len(batch_sizes)) for batch_size in batch_sizes: ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size) results_dict = {k: benchmark(est, data) for k, est in [("ipca", ipca)]} all_times["ipca"].append(results_dict["ipca"]["time"]) all_errors["ipca"].append(results_dict["ipca"]["error"]) plot_batch_times(all_times, n_components, batch_sizes, data) plot_batch_errors(all_errors, n_components, batch_sizes, data) faces = 
fetch_lfw_people(resize=0.2, min_faces_per_person=5) # limit dataset to 5000 people (don't care who they are!) X = faces.data[:5000] n_samples, h, w = faces.images.shape n_features = X.shape[1] X -= X.mean(axis=0) X /= X.std(axis=0) fixed_batch_size_comparison(X) variable_batch_size_comparison(X) plt.show() ================================================ FILE: benchmarks/bench_plot_lasso_path.py ================================================ """Benchmarks of Lasso regularization path computation using Lars and CD The input data is mostly low rank but is a fat infinite tail. """ from collections import defaultdict import gc import sys from time import time import numpy as np from sklearn.linear_model import lars_path, lars_path_gram from sklearn.linear_model import lasso_path from sklearn.datasets import make_regression def compute_bench(samples_range, features_range): it = 0 results = defaultdict(lambda: []) max_it = len(samples_range) * len(features_range) for n_samples in samples_range: for n_features in features_range: it += 1 print("====================") print("Iteration %03d of %03d" % (it, max_it)) print("====================") dataset_kwargs = { "n_samples": n_samples, "n_features": n_features, "n_informative": n_features // 10, "effective_rank": min(n_samples, n_features) / 10, # 'effective_rank': None, "bias": 0.0, } print("n_samples: %d" % n_samples) print("n_features: %d" % n_features) X, y = make_regression(**dataset_kwargs) gc.collect() print("benchmarking lars_path (with Gram):", end="") sys.stdout.flush() tstart = time() G = np.dot(X.T, X) # precomputed Gram matrix Xy = np.dot(X.T, y) lars_path_gram(Xy=Xy, Gram=G, n_samples=y.size, method="lasso") delta = time() - tstart print("%0.3fs" % delta) results["lars_path (with Gram)"].append(delta) gc.collect() print("benchmarking lars_path (without Gram):", end="") sys.stdout.flush() tstart = time() lars_path(X, y, method="lasso") delta = time() - tstart print("%0.3fs" % delta) results["lars_path (without Gram)"].append(delta) gc.collect() print("benchmarking lasso_path (with Gram):", end="") sys.stdout.flush() tstart = time() lasso_path(X, y, precompute=True) delta = time() - tstart print("%0.3fs" % delta) results["lasso_path (with Gram)"].append(delta) gc.collect() print("benchmarking lasso_path (without Gram):", end="") sys.stdout.flush() tstart = time() lasso_path(X, y, precompute=False) delta = time() - tstart print("%0.3fs" % delta) results["lasso_path (without Gram)"].append(delta) return results if __name__ == "__main__": from mpl_toolkits.mplot3d import axes3d # noqa register the 3d projection import matplotlib.pyplot as plt samples_range = np.linspace(10, 2000, 5).astype(int) features_range = np.linspace(10, 2000, 5).astype(int) results = compute_bench(samples_range, features_range) max_time = max(max(t) for t in results.values()) fig = plt.figure("scikit-learn Lasso path benchmark results") i = 1 for c, (label, timings) in zip("bcry", sorted(results.items())): ax = fig.add_subplot(2, 2, i, projection="3d") X, Y = np.meshgrid(samples_range, features_range) Z = np.asarray(timings).reshape(samples_range.shape[0], features_range.shape[0]) # plot the actual surface ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.8) # dummy point plot to stick the legend to since surface plot do not # support legends (yet?) 
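# NOTE (added comment): Axes3D.plot_surface returns a Poly3DCollection, which
# legend() cannot label directly; the commented-out line plot below was meant as
# an invisible legend proxy, and the per-subplot title set further down carries
# the label instead.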
# ax.plot([1], [1], [1], color=c, label=label) ax.set_xlabel("n_samples") ax.set_ylabel("n_features") ax.set_zlabel("Time (s)") ax.set_zlim3d(0.0, max_time * 1.1) ax.set_title(label) # ax.legend() i += 1 plt.show() ================================================ FILE: benchmarks/bench_plot_neighbors.py ================================================ """ Plot the scaling of the nearest neighbors algorithms with k, D, and N """ from time import time import numpy as np import matplotlib.pyplot as plt from matplotlib import ticker from sklearn import neighbors, datasets def get_data(N, D, dataset="dense"): if dataset == "dense": np.random.seed(0) return np.random.random((N, D)) elif dataset == "digits": X, _ = datasets.load_digits(return_X_y=True) i = np.argsort(X[0])[::-1] X = X[:, i] return X[:N, :D] else: raise ValueError("invalid dataset: %s" % dataset) def barplot_neighbors( Nrange=2 ** np.arange(1, 11), Drange=2 ** np.arange(7), krange=2 ** np.arange(10), N=1000, D=64, k=5, leaf_size=30, dataset="digits", ): algorithms = ("kd_tree", "brute", "ball_tree") fiducial_values = {"N": N, "D": D, "k": k} # ------------------------------------------------------------ # varying N N_results_build = {alg: np.zeros(len(Nrange)) for alg in algorithms} N_results_query = {alg: np.zeros(len(Nrange)) for alg in algorithms} for i, NN in enumerate(Nrange): print("N = %i (%i out of %i)" % (NN, i + 1, len(Nrange))) X = get_data(NN, D, dataset) for algorithm in algorithms: nbrs = neighbors.NearestNeighbors( n_neighbors=min(NN, k), algorithm=algorithm, leaf_size=leaf_size ) t0 = time() nbrs.fit(X) t1 = time() nbrs.kneighbors(X) t2 = time() N_results_build[algorithm][i] = t1 - t0 N_results_query[algorithm][i] = t2 - t1 # ------------------------------------------------------------ # varying D D_results_build = {alg: np.zeros(len(Drange)) for alg in algorithms} D_results_query = {alg: np.zeros(len(Drange)) for alg in algorithms} for i, DD in enumerate(Drange): print("D = %i (%i out of %i)" % (DD, i + 1, len(Drange))) X = get_data(N, DD, dataset) for algorithm in algorithms: nbrs = neighbors.NearestNeighbors( n_neighbors=k, algorithm=algorithm, leaf_size=leaf_size ) t0 = time() nbrs.fit(X) t1 = time() nbrs.kneighbors(X) t2 = time() D_results_build[algorithm][i] = t1 - t0 D_results_query[algorithm][i] = t2 - t1 # ------------------------------------------------------------ # varying k k_results_build = {alg: np.zeros(len(krange)) for alg in algorithms} k_results_query = {alg: np.zeros(len(krange)) for alg in algorithms} X = get_data(N, DD, dataset) for i, kk in enumerate(krange): print("k = %i (%i out of %i)" % (kk, i + 1, len(krange))) for algorithm in algorithms: nbrs = neighbors.NearestNeighbors( n_neighbors=kk, algorithm=algorithm, leaf_size=leaf_size ) t0 = time() nbrs.fit(X) t1 = time() nbrs.kneighbors(X) t2 = time() k_results_build[algorithm][i] = t1 - t0 k_results_query[algorithm][i] = t2 - t1 plt.figure(figsize=(8, 11)) for (sbplt, vals, quantity, build_time, query_time) in [ (311, Nrange, "N", N_results_build, N_results_query), (312, Drange, "D", D_results_build, D_results_query), (313, krange, "k", k_results_build, k_results_query), ]: ax = plt.subplot(sbplt, yscale="log") plt.grid(True) tick_vals = [] tick_labels = [] bottom = 10 ** np.min( [min(np.floor(np.log10(build_time[alg]))) for alg in algorithms] ) for i, alg in enumerate(algorithms): xvals = 0.1 + i * (1 + len(vals)) + np.arange(len(vals)) width = 0.8 c_bar = plt.bar(xvals, build_time[alg] - bottom, width, bottom, color="r") q_bar = 
plt.bar(xvals, query_time[alg], width, build_time[alg], color="b")
            tick_vals += list(xvals + 0.5 * width)
            tick_labels += ["%i" % val for val in vals]
            plt.text(
                (i + 0.02) / len(algorithms),
                0.98,
                alg,
                transform=ax.transAxes,
                ha="left",
                va="top",
                bbox=dict(facecolor="w", edgecolor="w", alpha=0.5),
            )

        plt.ylabel("Time (s)")
        ax.xaxis.set_major_locator(ticker.FixedLocator(tick_vals))
        ax.xaxis.set_major_formatter(ticker.FixedFormatter(tick_labels))
        for label in ax.get_xticklabels():
            label.set_rotation(-90)
            label.set_fontsize(10)

        title_string = "Varying %s" % quantity
        descr_string = ""
        for s in "NDk":
            if s != quantity:
                descr_string += "%s = %i, " % (s, fiducial_values[s])
        descr_string = descr_string[:-2]

        plt.text(
            1.01,
            0.5,
            title_string,
            transform=ax.transAxes,
            rotation=-90,
            ha="left",
            va="center",
            fontsize=20,
        )
        plt.text(
            0.99,
            0.5,
            descr_string,
            transform=ax.transAxes,
            rotation=-90,
            ha="right",
            va="center",
        )

    plt.gcf().suptitle("%s data set" % dataset.capitalize(), fontsize=16)
    plt.figlegend((c_bar, q_bar), ("construction", "N-point query"), "upper right")


if __name__ == "__main__":
    barplot_neighbors(dataset="digits")
    barplot_neighbors(dataset="dense")
    plt.show()


================================================
FILE: benchmarks/bench_plot_nmf.py
================================================
"""
Benchmarks of Non-Negative Matrix Factorization
"""
# Authors: Tom Dupre la Tour (benchmark)
#          Chih-Jen Lin (original projected gradient NMF implementation)
#          Anthony Di Franco (projected gradient, Python and NumPy port)
# License: BSD 3 clause

from time import time
import sys
import warnings
import numbers

import numpy as np
import matplotlib.pyplot as plt
from joblib import Memory
import pandas

from sklearn.utils._testing import ignore_warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition._nmf import _initialize_nmf
from sklearn.decomposition._nmf import _beta_divergence
from sklearn.decomposition._nmf import _check_init
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.extmath import safe_sparse_dot, squared_norm
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted, check_non_negative

# Cache results on disk; recent joblib versions take ``location`` rather
# than the removed ``cachedir`` keyword.
mem = Memory(location=".", verbose=0)

###################
# Start of _PGNMF #
###################
# This class implements a projected gradient solver for the NMF.
# The projected gradient solver was removed from scikit-learn in version 0.19,
# and a simplified copy is used here for comparison purpose only.
# It is not tested, and it may change or disappear without notice.


def _norm(x):
    """Dot product-based Euclidean norm implementation.

    See: http://fseoane.net/blog/2011/computing-the-vector-norm/
    """
    return np.sqrt(squared_norm(x))


def _nls_subproblem(
    X, W, H, tol, max_iter, alpha=0.0, l1_ratio=0.0, sigma=0.01, beta=0.1
):
    """Non-negative least squares solver.

    Solves a non-negative least squares subproblem using the projected
    gradient descent algorithm.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Constant matrix.

    W : array-like, shape (n_samples, n_components)
        Constant matrix.

    H : array-like, shape (n_components, n_features)
        Initial guess for the solution.

    tol : float
        Tolerance of the stopping condition.

    max_iter : int
        Maximum number of iterations before timing out.

    alpha : double, default: 0.
        Constant that multiplies the regularization terms. Set it to zero to
        have no regularization.

    l1_ratio : double, default: 0.
The regularization mixing parameter, with 0 <= l1_ratio <= 1. For l1_ratio = 0 the penalty is an L2 penalty. For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. sigma : float Constant used in the sufficient decrease condition checked by the line search. Smaller values lead to a looser sufficient decrease condition, thus reducing the time taken by the line search, but potentially increasing the number of iterations of the projected gradient procedure. 0.01 is a commonly used value in the optimization literature. beta : float Factor by which the step size is decreased (resp. increased) until (resp. as long as) the sufficient decrease condition is satisfied. Larger values allow to find a better step size but lead to longer line search. 0.1 is a commonly used value in the optimization literature. Returns ------- H : array-like, shape (n_components, n_features) Solution to the non-negative least squares problem. grad : array-like, shape (n_components, n_features) The gradient. n_iter : int The number of iterations done by the algorithm. References ---------- C.-J. Lin. Projected gradient methods for non-negative matrix factorization. Neural Computation, 19(2007), 2756-2779. https://www.csie.ntu.edu.tw/~cjlin/nmf/ """ WtX = safe_sparse_dot(W.T, X) WtW = np.dot(W.T, W) # values justified in the paper (alpha is renamed gamma) gamma = 1 for n_iter in range(1, max_iter + 1): grad = np.dot(WtW, H) - WtX if alpha > 0 and l1_ratio == 1.0: grad += alpha elif alpha > 0: grad += alpha * (l1_ratio + (1 - l1_ratio) * H) # The following multiplication with a boolean array is more than twice # as fast as indexing into grad. if _norm(grad * np.logical_or(grad < 0, H > 0)) < tol: break Hp = H for inner_iter in range(20): # Gradient step. Hn = H - gamma * grad # Projection step. 
Hn *= Hn > 0 d = Hn - H gradd = np.dot(grad.ravel(), d.ravel()) dQd = np.dot(np.dot(WtW, d).ravel(), d.ravel()) suff_decr = (1 - sigma) * gradd + 0.5 * dQd < 0 if inner_iter == 0: decr_gamma = not suff_decr if decr_gamma: if suff_decr: H = Hn break else: gamma *= beta elif not suff_decr or (Hp == Hn).all(): H = Hp break else: gamma /= beta Hp = Hn if n_iter == max_iter: warnings.warn("Iteration limit reached in nls subproblem.", ConvergenceWarning) return H, grad, n_iter def _fit_projected_gradient(X, W, H, tol, max_iter, nls_max_iter, alpha, l1_ratio): gradW = np.dot(W, np.dot(H, H.T)) - safe_sparse_dot(X, H.T, dense_output=True) gradH = np.dot(np.dot(W.T, W), H) - safe_sparse_dot(W.T, X, dense_output=True) init_grad = squared_norm(gradW) + squared_norm(gradH.T) # max(0.001, tol) to force alternating minimizations of W and H tolW = max(0.001, tol) * np.sqrt(init_grad) tolH = tolW for n_iter in range(1, max_iter + 1): # stopping condition as discussed in paper proj_grad_W = squared_norm(gradW * np.logical_or(gradW < 0, W > 0)) proj_grad_H = squared_norm(gradH * np.logical_or(gradH < 0, H > 0)) if (proj_grad_W + proj_grad_H) / init_grad < tol ** 2: break # update W Wt, gradWt, iterW = _nls_subproblem( X.T, H.T, W.T, tolW, nls_max_iter, alpha=alpha, l1_ratio=l1_ratio ) W, gradW = Wt.T, gradWt.T if iterW == 1: tolW = 0.1 * tolW # update H H, gradH, iterH = _nls_subproblem( X, W, H, tolH, nls_max_iter, alpha=alpha, l1_ratio=l1_ratio ) if iterH == 1: tolH = 0.1 * tolH H[H == 0] = 0 # fix up negative zeros if n_iter == max_iter: Wt, _, _ = _nls_subproblem( X.T, H.T, W.T, tolW, nls_max_iter, alpha=alpha, l1_ratio=l1_ratio ) W = Wt.T return W, H, n_iter class _PGNMF(NMF): """Non-Negative Matrix Factorization (NMF) with projected gradient solver. This class is private and for comparison purpose only. It may change or disappear without notice. 
""" def __init__( self, n_components=None, solver="pg", init=None, tol=1e-4, max_iter=200, random_state=None, alpha=0.0, l1_ratio=0.0, nls_max_iter=10, ): super().__init__( n_components=n_components, init=init, solver=solver, tol=tol, max_iter=max_iter, random_state=random_state, alpha=alpha, l1_ratio=l1_ratio, ) self.nls_max_iter = nls_max_iter def fit(self, X, y=None, **params): self.fit_transform(X, **params) return self def transform(self, X): check_is_fitted(self) H = self.components_ W, _, self.n_iter_ = self._fit_transform(X, H=H, update_H=False) return W def inverse_transform(self, W): check_is_fitted(self) return np.dot(W, self.components_) def fit_transform(self, X, y=None, W=None, H=None): W, H, self.n_iter = self._fit_transform(X, W=W, H=H, update_H=True) self.components_ = H return W def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): X = check_array(X, accept_sparse=("csr", "csc")) check_non_negative(X, "NMF (input X)") n_samples, n_features = X.shape n_components = self.n_components if n_components is None: n_components = n_features if not isinstance(n_components, numbers.Integral) or n_components <= 0: raise ValueError( "Number of components must be a positive integer; got (n_components=%r)" % n_components ) if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0: raise ValueError( "Maximum number of iterations must be a positive " "integer; got (max_iter=%r)" % self.max_iter ) if not isinstance(self.tol, numbers.Number) or self.tol < 0: raise ValueError( "Tolerance for stopping criteria must be positive; got (tol=%r)" % self.tol ) # check W and H, or initialize them if self.init == "custom" and update_H: _check_init(H, (n_components, n_features), "NMF (input H)") _check_init(W, (n_samples, n_components), "NMF (input W)") elif not update_H: _check_init(H, (n_components, n_features), "NMF (input H)") W = np.zeros((n_samples, n_components)) else: W, H = _initialize_nmf( X, n_components, init=self.init, random_state=self.random_state ) if update_H: # fit_transform W, H, n_iter = _fit_projected_gradient( X, W, H, self.tol, self.max_iter, self.nls_max_iter, self.alpha, self.l1_ratio, ) else: # transform Wt, _, n_iter = _nls_subproblem( X.T, H.T, W.T, self.tol, self.nls_max_iter, alpha=self.alpha, l1_ratio=self.l1_ratio, ) W = Wt.T if n_iter == self.max_iter and self.tol > 0: warnings.warn( "Maximum number of iteration %d reached. Increase it" " to improve convergence." % self.max_iter, ConvergenceWarning, ) return W, H, n_iter ################# # End of _PGNMF # ################# def plot_results(results_df, plot_name): if results_df is None: return None plt.figure(figsize=(16, 6)) colors = "bgr" markers = "ovs" ax = plt.subplot(1, 3, 1) for i, init in enumerate(np.unique(results_df["init"])): plt.subplot(1, 3, i + 1, sharex=ax, sharey=ax) for j, method in enumerate(np.unique(results_df["method"])): mask = np.logical_and( results_df["init"] == init, results_df["method"] == method ) selected_items = results_df[mask] plt.plot( selected_items["time"], selected_items["loss"], color=colors[j % len(colors)], ls="-", marker=markers[j % len(markers)], label=method, ) plt.legend(loc=0, fontsize="x-small") plt.xlabel("Time (s)") plt.ylabel("loss") plt.title("%s" % init) plt.suptitle(plot_name, fontsize=16) @ignore_warnings(category=ConvergenceWarning) # use joblib to cache the results. 
# X_shape is specified in arguments for avoiding hashing X @mem.cache(ignore=["X", "W0", "H0"]) def bench_one( name, X, W0, H0, X_shape, clf_type, clf_params, init, n_components, random_state ): W = W0.copy() H = H0.copy() clf = clf_type(**clf_params) st = time() W = clf.fit_transform(X, W=W, H=H) end = time() H = clf.components_ this_loss = _beta_divergence(X, W, H, 2.0, True) duration = end - st return this_loss, duration def run_bench(X, clfs, plot_name, n_components, tol, alpha, l1_ratio): start = time() results = [] for name, clf_type, iter_range, clf_params in clfs: print("Training %s:" % name) for rs, init in enumerate(("nndsvd", "nndsvdar", "random")): print(" %s %s: " % (init, " " * (8 - len(init))), end="") W, H = _initialize_nmf(X, n_components, init, 1e-6, rs) for max_iter in iter_range: clf_params["alpha"] = alpha clf_params["l1_ratio"] = l1_ratio clf_params["max_iter"] = max_iter clf_params["tol"] = tol clf_params["random_state"] = rs clf_params["init"] = "custom" clf_params["n_components"] = n_components this_loss, duration = bench_one( name, X, W, H, X.shape, clf_type, clf_params, init, n_components, rs ) init_name = "init='%s'" % init results.append((name, this_loss, duration, init_name)) # print("loss: %.6f, time: %.3f sec" % (this_loss, duration)) print(".", end="") sys.stdout.flush() print(" ") # Use a panda dataframe to organize the results results_df = pandas.DataFrame(results, columns="method loss time init".split()) print("Total time = %0.3f sec\n" % (time() - start)) # plot the results plot_results(results_df, plot_name) return results_df def load_20news(): print("Loading 20 newsgroups dataset") print("-----------------------------") from sklearn.datasets import fetch_20newsgroups dataset = fetch_20newsgroups( shuffle=True, random_state=1, remove=("headers", "footers", "quotes") ) vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english") tfidf = vectorizer.fit_transform(dataset.data) return tfidf def load_faces(): print("Loading Olivetti face dataset") print("-----------------------------") from sklearn.datasets import fetch_olivetti_faces faces = fetch_olivetti_faces(shuffle=True) return faces.data def build_clfs(cd_iters, pg_iters, mu_iters): clfs = [ ("Coordinate Descent", NMF, cd_iters, {"solver": "cd"}), ("Projected Gradient", _PGNMF, pg_iters, {"solver": "pg"}), ("Multiplicative Update", NMF, mu_iters, {"solver": "mu"}), ] return clfs if __name__ == "__main__": alpha = 0.0 l1_ratio = 0.5 n_components = 10 tol = 1e-15 # first benchmark on 20 newsgroup dataset: sparse, shape(11314, 39116) plot_name = "20 Newsgroups sparse dataset" cd_iters = np.arange(1, 30) pg_iters = np.arange(1, 6) mu_iters = np.arange(1, 30) clfs = build_clfs(cd_iters, pg_iters, mu_iters) X_20news = load_20news() run_bench(X_20news, clfs, plot_name, n_components, tol, alpha, l1_ratio) # second benchmark on Olivetti faces dataset: dense, shape(400, 4096) plot_name = "Olivetti Faces dense dataset" cd_iters = np.arange(1, 30) pg_iters = np.arange(1, 12) mu_iters = np.arange(1, 30) clfs = build_clfs(cd_iters, pg_iters, mu_iters) X_faces = load_faces() run_bench( X_faces, clfs, plot_name, n_components, tol, alpha, l1_ratio, ) plt.show() ================================================ FILE: benchmarks/bench_plot_omp_lars.py ================================================ """Benchmarks of orthogonal matching pursuit (:ref:`OMP`) versus least angle regression (:ref:`least_angle_regression`) The input data is mostly low rank but is a fat infinite tail. 
""" import gc import sys from time import time import numpy as np from sklearn.linear_model import lars_path, lars_path_gram, orthogonal_mp from sklearn.datasets import make_sparse_coded_signal def compute_bench(samples_range, features_range): it = 0 results = dict() lars = np.empty((len(features_range), len(samples_range))) lars_gram = lars.copy() omp = lars.copy() omp_gram = lars.copy() max_it = len(samples_range) * len(features_range) for i_s, n_samples in enumerate(samples_range): for i_f, n_features in enumerate(features_range): it += 1 n_informative = n_features / 10 print("====================") print("Iteration %03d of %03d" % (it, max_it)) print("====================") # dataset_kwargs = { # 'n_train_samples': n_samples, # 'n_test_samples': 2, # 'n_features': n_features, # 'n_informative': n_informative, # 'effective_rank': min(n_samples, n_features) / 10, # #'effective_rank': None, # 'bias': 0.0, # } dataset_kwargs = { "n_samples": 1, "n_components": n_features, "n_features": n_samples, "n_nonzero_coefs": n_informative, "random_state": 0, } print("n_samples: %d" % n_samples) print("n_features: %d" % n_features) y, X, _ = make_sparse_coded_signal(**dataset_kwargs) X = np.asfortranarray(X) gc.collect() print("benchmarking lars_path (with Gram):", end="") sys.stdout.flush() tstart = time() G = np.dot(X.T, X) # precomputed Gram matrix Xy = np.dot(X.T, y) lars_path_gram(Xy=Xy, Gram=G, n_samples=y.size, max_iter=n_informative) delta = time() - tstart print("%0.3fs" % delta) lars_gram[i_f, i_s] = delta gc.collect() print("benchmarking lars_path (without Gram):", end="") sys.stdout.flush() tstart = time() lars_path(X, y, Gram=None, max_iter=n_informative) delta = time() - tstart print("%0.3fs" % delta) lars[i_f, i_s] = delta gc.collect() print("benchmarking orthogonal_mp (with Gram):", end="") sys.stdout.flush() tstart = time() orthogonal_mp(X, y, precompute=True, n_nonzero_coefs=n_informative) delta = time() - tstart print("%0.3fs" % delta) omp_gram[i_f, i_s] = delta gc.collect() print("benchmarking orthogonal_mp (without Gram):", end="") sys.stdout.flush() tstart = time() orthogonal_mp(X, y, precompute=False, n_nonzero_coefs=n_informative) delta = time() - tstart print("%0.3fs" % delta) omp[i_f, i_s] = delta results["time(LARS) / time(OMP)\n (w/ Gram)"] = lars_gram / omp_gram results["time(LARS) / time(OMP)\n (w/o Gram)"] = lars / omp return results if __name__ == "__main__": samples_range = np.linspace(1000, 5000, 5).astype(int) features_range = np.linspace(1000, 5000, 5).astype(int) results = compute_bench(samples_range, features_range) max_time = max(np.max(t) for t in results.values()) import matplotlib.pyplot as plt fig = plt.figure("scikit-learn OMP vs. 
LARS benchmark results") for i, (label, timings) in enumerate(sorted(results.items())): ax = fig.add_subplot(1, 2, i + 1) vmax = max(1 - timings.min(), -1 + timings.max()) plt.matshow(timings, fignum=False, vmin=1 - vmax, vmax=1 + vmax) ax.set_xticklabels([""] + [str(each) for each in samples_range]) ax.set_yticklabels([""] + [str(each) for each in features_range]) plt.xlabel("n_samples") plt.ylabel("n_features") plt.title(label) plt.subplots_adjust(0.1, 0.08, 0.96, 0.98, 0.4, 0.63) ax = plt.axes([0.1, 0.08, 0.8, 0.06]) plt.colorbar(cax=ax, orientation="horizontal") plt.show() ================================================ FILE: benchmarks/bench_plot_parallel_pairwise.py ================================================ # Author: Mathieu Blondel # License: BSD 3 clause import time import matplotlib.pyplot as plt from sklearn.utils import check_random_state from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.pairwise import pairwise_kernels def plot(func): random_state = check_random_state(0) one_core = [] multi_core = [] sample_sizes = range(1000, 6000, 1000) for n_samples in sample_sizes: X = random_state.rand(n_samples, 300) start = time.time() func(X, n_jobs=1) one_core.append(time.time() - start) start = time.time() func(X, n_jobs=-1) multi_core.append(time.time() - start) plt.figure("scikit-learn parallel %s benchmark results" % func.__name__) plt.plot(sample_sizes, one_core, label="one core") plt.plot(sample_sizes, multi_core, label="multi core") plt.xlabel("n_samples") plt.ylabel("Time (s)") plt.title("Parallel %s" % func.__name__) plt.legend() def euclidean_distances(X, n_jobs): return pairwise_distances(X, metric="euclidean", n_jobs=n_jobs) def rbf_kernels(X, n_jobs): return pairwise_kernels(X, metric="rbf", n_jobs=n_jobs, gamma=0.1) plot(euclidean_distances) plot(rbf_kernels) plt.show() ================================================ FILE: benchmarks/bench_plot_polynomial_kernel_approximation.py ================================================ """ ======================================================================== Benchmark for explicit feature map approximation of polynomial kernels ======================================================================== An example illustrating the approximation of the feature map of an Homogeneous Polynomial kernel. .. currentmodule:: sklearn.kernel_approximation It shows how to use :class:`PolynomialCountSketch` and :class:`Nystroem` to approximate the feature map of a polynomial kernel for classification with an SVM on the digits dataset. Results using a linear SVM in the original space, a linear SVM using the approximate mappings and a kernelized SVM are compared. The first plot shows the classification accuracy of Nystroem [2] and PolynomialCountSketch [1] as the output dimension (n_components) grows. It also shows the accuracy of a linear SVM and a polynomial kernel SVM on the same data. The second plot explores the scalability of PolynomialCountSketch and Nystroem. For a sufficiently large output dimension, PolynomialCountSketch should be faster as it is O(n(d+klog k)) while Nystroem is O(n(dk+k^2)). In addition, Nystroem requires a time-consuming training phase, while training is almost immediate for PolynomialCountSketch, whose training phase boils down to initializing some random variables (because is data-independent). [1] Pham, N., & Pagh, R. (2013, August). Fast and scalable polynomial kernels via explicit feature maps. 
In Proceedings of the 19th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 239-247) (http://chbrown.github.io/kdd-2013-usb/kdd/p239.pdf) [2] Charikar, M., Chen, K., & Farach-Colton, M. (2002, July). Finding frequent items in data streams. In International Colloquium on Automata, Languages, and Programming (pp. 693-703). Springer, Berlin, Heidelberg. (http://www.vldb.org/pvldb/1/1454225.pdf) """ # Author: Daniel Lopez-Sanchez # License: BSD 3 clause # Load data manipulation functions from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split # Some common libraries import matplotlib.pyplot as plt import numpy as np # Will use this for timing results from time import time # Import SVM classifiers and feature map approximation algorithms from sklearn.svm import LinearSVC, SVC from sklearn.kernel_approximation import Nystroem, PolynomialCountSketch from sklearn.pipeline import Pipeline # Split data in train and test sets X, y = load_digits()["data"], load_digits()["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7) # Set the range of n_components for our experiments out_dims = range(20, 400, 20) # Evaluate Linear SVM lsvm = LinearSVC().fit(X_train, y_train) lsvm_score = 100 * lsvm.score(X_test, y_test) # Evaluate kernelized SVM ksvm = SVC(kernel="poly", degree=2, gamma=1.0).fit(X_train, y_train) ksvm_score = 100 * ksvm.score(X_test, y_test) # Evaluate PolynomialCountSketch + LinearSVM ps_svm_scores = [] n_runs = 5 # To compensate for the stochasticity of the method, we make n_tets runs for k in out_dims: score_avg = 0 for _ in range(n_runs): ps_svm = Pipeline( [ ("PS", PolynomialCountSketch(degree=2, n_components=k)), ("SVM", LinearSVC()), ] ) score_avg += ps_svm.fit(X_train, y_train).score(X_test, y_test) ps_svm_scores.append(100 * score_avg / n_runs) # Evaluate Nystroem + LinearSVM ny_svm_scores = [] n_runs = 5 for k in out_dims: score_avg = 0 for _ in range(n_runs): ny_svm = Pipeline( [ ( "NY", Nystroem( kernel="poly", gamma=1.0, degree=2, coef0=0, n_components=k ), ), ("SVM", LinearSVC()), ] ) score_avg += ny_svm.fit(X_train, y_train).score(X_test, y_test) ny_svm_scores.append(100 * score_avg / n_runs) # Show results fig, ax = plt.subplots(figsize=(6, 4)) ax.set_title("Accuracy results") ax.plot(out_dims, ps_svm_scores, label="PolynomialCountSketch + linear SVM", c="orange") ax.plot(out_dims, ny_svm_scores, label="Nystroem + linear SVM", c="blue") ax.plot( [out_dims[0], out_dims[-1]], [lsvm_score, lsvm_score], label="Linear SVM", c="black", dashes=[2, 2], ) ax.plot( [out_dims[0], out_dims[-1]], [ksvm_score, ksvm_score], label="Poly-kernel SVM", c="red", dashes=[2, 2], ) ax.legend() ax.set_xlabel("N_components for PolynomialCountSketch and Nystroem") ax.set_ylabel("Accuracy (%)") ax.set_xlim([out_dims[0], out_dims[-1]]) fig.tight_layout() # Now lets evaluate the scalability of PolynomialCountSketch vs Nystroem # First we generate some fake data with a lot of samples fakeData = np.random.randn(10000, 100) fakeDataY = np.random.randint(0, high=10, size=(10000)) out_dims = range(500, 6000, 500) # Evaluate scalability of PolynomialCountSketch as n_components grows ps_svm_times = [] for k in out_dims: ps = PolynomialCountSketch(degree=2, n_components=k) start = time() ps.fit_transform(fakeData, None) ps_svm_times.append(time() - start) # Evaluate scalability of Nystroem as n_components grows # This can take a while due to the inefficient training phase ny_svm_times = [] for k in out_dims: ny = 
Nystroem(kernel="poly", gamma=1.0, degree=2, coef0=0, n_components=k) start = time() ny.fit_transform(fakeData, None) ny_svm_times.append(time() - start) # Show results fig, ax = plt.subplots(figsize=(6, 4)) ax.set_title("Scalability results") ax.plot(out_dims, ps_svm_times, label="PolynomialCountSketch", c="orange") ax.plot(out_dims, ny_svm_times, label="Nystroem", c="blue") ax.legend() ax.set_xlabel("N_components for PolynomialCountSketch and Nystroem") ax.set_ylabel("fit_transform time \n(s/10.000 samples)") ax.set_xlim([out_dims[0], out_dims[-1]]) fig.tight_layout() plt.show() ================================================ FILE: benchmarks/bench_plot_randomized_svd.py ================================================ """ Benchmarks on the power iterations phase in randomized SVD. We test on various synthetic and real datasets the effect of increasing the number of power iterations in terms of quality of approximation and running time. A number greater than 0 should help with noisy matrices, which are characterized by a slow spectral decay. We test several policy for normalizing the power iterations. Normalization is crucial to avoid numerical issues. The quality of the approximation is measured by the spectral norm discrepancy between the original input matrix and the reconstructed one (by multiplying the randomized_svd's outputs). The spectral norm is always equivalent to the largest singular value of a matrix. (3) justifies this choice. However, one can notice in these experiments that Frobenius and spectral norms behave very similarly in a qualitative sense. Therefore, we suggest to run these benchmarks with `enable_spectral_norm = False`, as Frobenius' is MUCH faster to compute. The benchmarks follow. (a) plot: time vs norm, varying number of power iterations data: many datasets goal: compare normalization policies and study how the number of power iterations affect time and norm (b) plot: n_iter vs norm, varying rank of data and number of components for randomized_SVD data: low-rank matrices on which we control the rank goal: study whether the rank of the matrix and the number of components extracted by randomized SVD affect "the optimal" number of power iterations (c) plot: time vs norm, varying datasets data: many datasets goal: compare default configurations We compare the following algorithms: - randomized_svd(..., power_iteration_normalizer='none') - randomized_svd(..., power_iteration_normalizer='LU') - randomized_svd(..., power_iteration_normalizer='QR') - randomized_svd(..., power_iteration_normalizer='auto') - fbpca.pca() from https://github.com/facebook/fbpca (if installed) Conclusion ---------- - n_iter=2 appears to be a good default value - power_iteration_normalizer='none' is OK if n_iter is small, otherwise LU gives similar errors to QR but is cheaper. That's what 'auto' implements. References ---------- (1) Finding structure with randomness: Stochastic algorithms for constructing approximate matrix decompositions Halko, et al., 2009 https://arxiv.org/abs/0909.4061 (2) A randomized algorithm for the decomposition of matrices Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert (3) An implementation of a randomized algorithm for principal component analysis A. Szlam et al. 
2014 """ # Author: Giorgio Patrini import numpy as np import scipy as sp import matplotlib.pyplot as plt import gc import pickle from time import time from collections import defaultdict import os.path from sklearn.utils._arpack import _init_arpack_v0 from sklearn.utils import gen_batches from sklearn.utils.validation import check_random_state from sklearn.utils.extmath import randomized_svd from sklearn.datasets import make_low_rank_matrix, make_sparse_uncorrelated from sklearn.datasets import ( fetch_lfw_people, fetch_openml, fetch_20newsgroups_vectorized, fetch_olivetti_faces, fetch_rcv1, ) try: import fbpca fbpca_available = True except ImportError: fbpca_available = False # If this is enabled, tests are much slower and will crash with the large data enable_spectral_norm = False # TODO: compute approximate spectral norms with the power method as in # Estimating the largest eigenvalues by the power and Lanczos methods with # a random start, Jacek Kuczynski and Henryk Wozniakowski, SIAM Journal on # Matrix Analysis and Applications, 13 (4): 1094-1122, 1992. # This approximation is a very fast estimate of the spectral norm, but depends # on starting random vectors. # Determine when to switch to batch computation for matrix norms, # in case the reconstructed (dense) matrix is too large MAX_MEMORY = int(2e9) # The following datasets can be downloaded manually from: # CIFAR 10: https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz # SVHN: http://ufldl.stanford.edu/housenumbers/train_32x32.mat CIFAR_FOLDER = "./cifar-10-batches-py/" SVHN_FOLDER = "./SVHN/" datasets = [ "low rank matrix", "lfw_people", "olivetti_faces", "20newsgroups", "mnist_784", "CIFAR", "a3a", "SVHN", "uncorrelated matrix", ] big_sparse_datasets = ["big sparse matrix", "rcv1"] def unpickle(file_name): with open(file_name, "rb") as fo: return pickle.load(fo, encoding="latin1")["data"] def handle_missing_dataset(file_folder): if not os.path.isdir(file_folder): print("%s file folder not found. Test skipped." 
% file_folder)
        return 0


def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == "lfw_people":
        X = fetch_lfw_people().data
    elif dataset_name == "20newsgroups":
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == "olivetti_faces":
        X = fetch_olivetti_faces().data
    elif dataset_name == "rcv1":
        X = fetch_rcv1().data
    elif dataset_name == "CIFAR":
        # handle_missing_dataset returns 0 when the folder is absent
        if handle_missing_dataset(CIFAR_FOLDER) == 0:
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1)) for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == "SVHN":
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)["X"]
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == "low rank matrix":
        X = make_low_rank_matrix(
            n_samples=500,
            n_features=int(1e4),
            effective_rank=100,
            tail_strength=0.5,
            random_state=random_state,
        )
    elif dataset_name == "uncorrelated matrix":
        X, _ = make_sparse_uncorrelated(
            n_samples=500, n_features=10000, random_state=random_state
        )
    elif dataset_name == "big sparse matrix":
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name).data
    return X


def plot_time_vs_s(time, norm, point_labels, title):
    plt.figure()
    colors = ["g", "b", "y"]
    for i, l in enumerate(sorted(norm.keys())):
        if l != "fbpca":
            plt.plot(time[l], norm[l], label=l, marker="o", c=colors.pop())
        else:
            plt.plot(time[l], norm[l], label=l, marker="^", c="red")

        for label, x, y in zip(point_labels, list(time[l]), list(norm[l])):
            plt.annotate(
                label,
                xy=(x, y),
                xytext=(0, -20),
                textcoords="offset points",
                ha="right",
                va="bottom",
            )

    plt.legend(loc="upper right")
    plt.suptitle(title)
    plt.ylabel("norm discrepancy")
    plt.xlabel("running time [s]")


def scatter_time_vs_s(time, norm, point_labels, title):
    plt.figure()
    size = 100
    for i, l in enumerate(sorted(norm.keys())):
        if l != "fbpca":
            plt.scatter(time[l], norm[l], label=l, marker="o", c="b", s=size)
            for label, x, y in zip(point_labels, list(time[l]), list(norm[l])):
                plt.annotate(
                    label,
                    xy=(x, y),
                    xytext=(0, -80),
                    textcoords="offset points",
                    ha="right",
                    arrowprops=dict(arrowstyle="->", connectionstyle="arc3"),
                    va="bottom",
                    size=11,
                    rotation=90,
                )
        else:
            plt.scatter(time[l], norm[l], label=l, marker="^", c="red", s=size)
            for label, x, y in zip(point_labels, list(time[l]), list(norm[l])):
                plt.annotate(
                    label,
                    xy=(x, y),
                    xytext=(0, 30),
                    textcoords="offset points",
                    ha="right",
                    arrowprops=dict(arrowstyle="->", connectionstyle="arc3"),
                    va="bottom",
                    size=11,
                    rotation=90,
                )

    plt.legend(loc="best")
    plt.suptitle(title)
    plt.ylabel("norm discrepancy")
    plt.xlabel("running time [s]")


def plot_power_iter_vs_s(power_iter, s, title):
    plt.figure()
    for l in sorted(s.keys()):
        plt.plot(power_iter, s[l], label=l, marker="o")

    plt.legend(loc="lower right", prop={"size": 10})
    plt.suptitle(title)
    plt.ylabel("norm discrepancy")
    plt.xlabel("n_iter")


def svd_timing(
    X, n_comps, n_iter, n_oversamples, power_iteration_normalizer="auto", method=None
):
    """
    Measure time for decomposition
    """
    print("...
running SVD ...") if method != "fbpca": gc.collect() t0 = time() U, mu, V = randomized_svd( X, n_comps, n_oversamples, n_iter, power_iteration_normalizer, random_state=random_state, transpose=False, ) call_time = time() - t0 else: gc.collect() t0 = time() # There is a different convention for l here U, mu, V = fbpca.pca( X, n_comps, raw=True, n_iter=n_iter, l=n_oversamples + n_comps ) call_time = time() - t0 return U, mu, V, call_time def norm_diff(A, norm=2, msg=True, random_state=None): """ Compute the norm diff with the original matrix, when randomized SVD is called with *params. norm: 2 => spectral; 'fro' => Frobenius """ if msg: print("... computing %s norm ..." % norm) if norm == 2: # s = sp.linalg.norm(A, ord=2) # slow v0 = _init_arpack_v0(min(A.shape), random_state) value = sp.sparse.linalg.svds(A, k=1, return_singular_vectors=False, v0=v0) else: if sp.sparse.issparse(A): value = sp.sparse.linalg.norm(A, ord=norm) else: value = sp.linalg.norm(A, ord=norm) return value def scalable_frobenius_norm_discrepancy(X, U, s, V): # if the input is not too big, just call scipy if X.shape[0] * X.shape[1] < MAX_MEMORY: A = X - U.dot(np.diag(s).dot(V)) return norm_diff(A, norm="fro") print("... computing fro norm by batches...") batch_size = 1000 Vhat = np.diag(s).dot(V) cum_norm = 0.0 for batch in gen_batches(X.shape[0], batch_size): M = X[batch, :] - U[batch, :].dot(Vhat) cum_norm += norm_diff(M, norm="fro", msg=False) return np.sqrt(cum_norm) def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps): all_time = defaultdict(list) if enable_spectral_norm: all_spectral = defaultdict(list) X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0) all_frobenius = defaultdict(list) X_fro_norm = norm_diff(X, norm="fro", msg=False) for pi in power_iter: for pm in ["none", "LU", "QR"]: print("n_iter = %d on sklearn - %s" % (pi, pm)) U, s, V, time = svd_timing( X, n_comps, n_iter=pi, power_iteration_normalizer=pm, n_oversamples=n_oversamples, ) label = "sklearn - %s" % pm all_time[label].append(time) if enable_spectral_norm: A = U.dot(np.diag(s).dot(V)) all_spectral[label].append( norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm ) f = scalable_frobenius_norm_discrepancy(X, U, s, V) all_frobenius[label].append(f / X_fro_norm) if fbpca_available: print("n_iter = %d on fbca" % (pi)) U, s, V, time = svd_timing( X, n_comps, n_iter=pi, power_iteration_normalizer=pm, n_oversamples=n_oversamples, method="fbpca", ) label = "fbpca" all_time[label].append(time) if enable_spectral_norm: A = U.dot(np.diag(s).dot(V)) all_spectral[label].append( norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm ) f = scalable_frobenius_norm_discrepancy(X, U, s, V) all_frobenius[label].append(f / X_fro_norm) if enable_spectral_norm: title = "%s: spectral norm diff vs running time" % (dataset_name) plot_time_vs_s(all_time, all_spectral, power_iter, title) title = "%s: Frobenius norm diff vs running time" % (dataset_name) plot_time_vs_s(all_time, all_frobenius, power_iter, title) def bench_b(power_list): n_samples, n_features = 1000, 10000 data_params = { "n_samples": n_samples, "n_features": n_features, "tail_strength": 0.7, "random_state": random_state, } dataset_name = "low rank matrix %d x %d" % (n_samples, n_features) ranks = [10, 50, 100] if enable_spectral_norm: all_spectral = defaultdict(list) all_frobenius = defaultdict(list) for rank in ranks: X = make_low_rank_matrix(effective_rank=rank, **data_params) if enable_spectral_norm: X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0) 
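# ----------------------------------------------------------------------
# Toy illustration of the quantity bench_a and bench_b track: on a noisy
# low-rank matrix, increasing n_iter (the number of power iterations)
# reduces the relative Frobenius reconstruction error of randomized_svd,
# at the cost of extra passes over the data. All *_demo names and sizes
# below are illustrative, far smaller than the benchmark's.
import numpy as np
from scipy.linalg import norm
from sklearn.datasets import make_low_rank_matrix
from sklearn.utils.extmath import randomized_svd

A_demo = make_low_rank_matrix(
    n_samples=300, n_features=500, effective_rank=30,
    tail_strength=0.7, random_state=0,
)
for n_iter_demo in (0, 2, 5):
    U_demo, s_demo, Vt_demo = randomized_svd(
        A_demo, n_components=20, n_iter=n_iter_demo, random_state=0
    )
    rel_err = norm(A_demo - (U_demo * s_demo) @ Vt_demo, "fro") / norm(A_demo, "fro")
    print("n_iter=%d: relative Frobenius error %.4f" % (n_iter_demo, rel_err))
# ----------------------------------------------------------------------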
X_fro_norm = norm_diff(X, norm="fro", msg=False) for n_comp in [int(rank / 2), rank, rank * 2]: label = "rank=%d, n_comp=%d" % (rank, n_comp) print(label) for pi in power_list: U, s, V, _ = svd_timing( X, n_comp, n_iter=pi, n_oversamples=2, power_iteration_normalizer="LU", ) if enable_spectral_norm: A = U.dot(np.diag(s).dot(V)) all_spectral[label].append( norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm ) f = scalable_frobenius_norm_discrepancy(X, U, s, V) all_frobenius[label].append(f / X_fro_norm) if enable_spectral_norm: title = "%s: spectral norm diff vs n power iteration" % (dataset_name) plot_power_iter_vs_s(power_iter, all_spectral, title) title = "%s: Frobenius norm diff vs n power iteration" % (dataset_name) plot_power_iter_vs_s(power_iter, all_frobenius, title) def bench_c(datasets, n_comps): all_time = defaultdict(list) if enable_spectral_norm: all_spectral = defaultdict(list) all_frobenius = defaultdict(list) for dataset_name in datasets: X = get_data(dataset_name) if X is None: continue if enable_spectral_norm: X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0) X_fro_norm = norm_diff(X, norm="fro", msg=False) n_comps = np.minimum(n_comps, np.min(X.shape)) label = "sklearn" print("%s %d x %d - %s" % (dataset_name, X.shape[0], X.shape[1], label)) U, s, V, time = svd_timing(X, n_comps, n_iter=2, n_oversamples=10, method=label) all_time[label].append(time) if enable_spectral_norm: A = U.dot(np.diag(s).dot(V)) all_spectral[label].append( norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm ) f = scalable_frobenius_norm_discrepancy(X, U, s, V) all_frobenius[label].append(f / X_fro_norm) if fbpca_available: label = "fbpca" print("%s %d x %d - %s" % (dataset_name, X.shape[0], X.shape[1], label)) U, s, V, time = svd_timing( X, n_comps, n_iter=2, n_oversamples=2, method=label ) all_time[label].append(time) if enable_spectral_norm: A = U.dot(np.diag(s).dot(V)) all_spectral[label].append( norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm ) f = scalable_frobenius_norm_discrepancy(X, U, s, V) all_frobenius[label].append(f / X_fro_norm) if len(all_time) == 0: raise ValueError("No tests ran. Aborting.") if enable_spectral_norm: title = "normalized spectral norm diff vs running time" scatter_time_vs_s(all_time, all_spectral, datasets, title) title = "normalized Frobenius norm diff vs running time" scatter_time_vs_s(all_time, all_frobenius, datasets, title) if __name__ == "__main__": random_state = check_random_state(1234) power_iter = np.linspace(0, 6, 7, dtype=int) n_comps = 50 for dataset_name in datasets: X = get_data(dataset_name) if X is None: continue print( " >>>>>> Benching sklearn and fbpca on %s %d x %d" % (dataset_name, X.shape[0], X.shape[1]) ) bench_a( X, dataset_name, power_iter, n_oversamples=2, n_comps=np.minimum(n_comps, np.min(X.shape)), ) print(" >>>>>> Benching on simulated low rank matrix with variable rank") bench_b(power_iter) print(" >>>>>> Benching sklearn and fbpca default configurations") bench_c(datasets + big_sparse_datasets, n_comps) plt.show() ================================================ FILE: benchmarks/bench_plot_svd.py ================================================ """Benchmarks of Singular Value Decomposition (Exact and Approximate) The data is mostly low rank but is a fat infinite tail. 
""" import gc from time import time import numpy as np from collections import defaultdict from scipy.linalg import svd from sklearn.utils.extmath import randomized_svd from sklearn.datasets import make_low_rank_matrix def compute_bench(samples_range, features_range, n_iter=3, rank=50): it = 0 results = defaultdict(lambda: []) max_it = len(samples_range) * len(features_range) for n_samples in samples_range: for n_features in features_range: it += 1 print("====================") print("Iteration %03d of %03d" % (it, max_it)) print("====================") X = make_low_rank_matrix( n_samples, n_features, effective_rank=rank, tail_strength=0.2 ) gc.collect() print("benchmarking scipy svd: ") tstart = time() svd(X, full_matrices=False) results["scipy svd"].append(time() - tstart) gc.collect() print("benchmarking scikit-learn randomized_svd: n_iter=0") tstart = time() randomized_svd(X, rank, n_iter=0) results["scikit-learn randomized_svd (n_iter=0)"].append(time() - tstart) gc.collect() print("benchmarking scikit-learn randomized_svd: n_iter=%d " % n_iter) tstart = time() randomized_svd(X, rank, n_iter=n_iter) results["scikit-learn randomized_svd (n_iter=%d)" % n_iter].append( time() - tstart ) return results if __name__ == "__main__": from mpl_toolkits.mplot3d import axes3d # noqa register the 3d projection import matplotlib.pyplot as plt samples_range = np.linspace(2, 1000, 4).astype(int) features_range = np.linspace(2, 1000, 4).astype(int) results = compute_bench(samples_range, features_range) label = "scikit-learn singular value decomposition benchmark results" fig = plt.figure(label) ax = fig.gca(projection="3d") for c, (label, timings) in zip("rbg", sorted(results.items())): X, Y = np.meshgrid(samples_range, features_range) Z = np.asarray(timings).reshape(samples_range.shape[0], features_range.shape[0]) # plot the actual surface ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3, color=c) # dummy point plot to stick the legend to since surface plot do not # support legends (yet?) 
ax.plot([1], [1], [1], color=c, label=label) ax.set_xlabel("n_samples") ax.set_ylabel("n_features") ax.set_zlabel("Time (s)") ax.legend() plt.show() ================================================ FILE: benchmarks/bench_plot_ward.py ================================================ """ Benchmark scikit-learn's Ward implement compared to SciPy's """ import time import numpy as np from scipy.cluster import hierarchy import matplotlib.pyplot as plt from sklearn.cluster import AgglomerativeClustering ward = AgglomerativeClustering(n_clusters=3, linkage="ward") n_samples = np.logspace(0.5, 3, 9) n_features = np.logspace(1, 3.5, 7) N_samples, N_features = np.meshgrid(n_samples, n_features) scikits_time = np.zeros(N_samples.shape) scipy_time = np.zeros(N_samples.shape) for i, n in enumerate(n_samples): for j, p in enumerate(n_features): X = np.random.normal(size=(n, p)) t0 = time.time() ward.fit(X) scikits_time[j, i] = time.time() - t0 t0 = time.time() hierarchy.ward(X) scipy_time[j, i] = time.time() - t0 ratio = scikits_time / scipy_time plt.figure("scikit-learn Ward's method benchmark results") plt.imshow(np.log(ratio), aspect="auto", origin="lower") plt.colorbar() plt.contour( ratio, levels=[ 1, ], colors="k", ) plt.yticks(range(len(n_features)), n_features.astype(int)) plt.ylabel("N features") plt.xticks(range(len(n_samples)), n_samples.astype(int)) plt.xlabel("N samples") plt.title("Scikit's time, in units of scipy time (log)") plt.show() ================================================ FILE: benchmarks/bench_random_projections.py ================================================ """ =========================== Random projection benchmark =========================== Benchmarks for random projections. """ import gc import sys import optparse from datetime import datetime import collections import numpy as np import scipy.sparse as sp from sklearn import clone from sklearn.random_projection import ( SparseRandomProjection, GaussianRandomProjection, johnson_lindenstrauss_min_dim, ) def type_auto_or_float(val): if val == "auto": return "auto" else: return float(val) def type_auto_or_int(val): if val == "auto": return "auto" else: return int(val) def compute_time(t_start, delta): mu_second = 0.0 + 10 ** 6 # number of microseconds in a second return delta.seconds + delta.microseconds / mu_second def bench_scikit_transformer(X, transformer): gc.collect() clf = clone(transformer) # start time t_start = datetime.now() clf.fit(X) delta = datetime.now() - t_start # stop time time_to_fit = compute_time(t_start, delta) # start time t_start = datetime.now() clf.transform(X) delta = datetime.now() - t_start # stop time time_to_transform = compute_time(t_start, delta) return time_to_fit, time_to_transform # Make some random data with uniformly located non zero entries with # Gaussian distributed values def make_sparse_random_data(n_samples, n_features, n_nonzeros, random_state=None): rng = np.random.RandomState(random_state) data_coo = sp.coo_matrix( ( rng.randn(n_nonzeros), ( rng.randint(n_samples, size=n_nonzeros), rng.randint(n_features, size=n_nonzeros), ), ), shape=(n_samples, n_features), ) return data_coo.toarray(), data_coo.tocsr() def print_row(clf_type, time_fit, time_transform): print( "%s | %s | %s" % ( clf_type.ljust(30), ("%.4fs" % time_fit).center(12), ("%.4fs" % time_transform).center(12), ) ) if __name__ == "__main__": ########################################################################### # Option parser ########################################################################### op = 
optparse.OptionParser() op.add_option( "--n-times", dest="n_times", default=5, type=int, help="Benchmark results are average over n_times experiments", ) op.add_option( "--n-features", dest="n_features", default=10 ** 4, type=int, help="Number of features in the benchmarks", ) op.add_option( "--n-components", dest="n_components", default="auto", help="Size of the random subspace. ('auto' or int > 0)", ) op.add_option( "--ratio-nonzeros", dest="ratio_nonzeros", default=10 ** -3, type=float, help="Number of features in the benchmarks", ) op.add_option( "--n-samples", dest="n_samples", default=500, type=int, help="Number of samples in the benchmarks", ) op.add_option( "--random-seed", dest="random_seed", default=13, type=int, help="Seed used by the random number generators.", ) op.add_option( "--density", dest="density", default=1 / 3, help=( "Density used by the sparse random projection. ('auto' or float (0.0, 1.0]" ), ) op.add_option( "--eps", dest="eps", default=0.5, type=float, help="See the documentation of the underlying transformers.", ) op.add_option( "--transformers", dest="selected_transformers", default="GaussianRandomProjection,SparseRandomProjection", type=str, help=( "Comma-separated list of transformer to benchmark. " "Default: %default. Available: " "GaussianRandomProjection,SparseRandomProjection" ), ) op.add_option( "--dense", dest="dense", default=False, action="store_true", help="Set input space as a dense matrix.", ) (opts, args) = op.parse_args() if len(args) > 0: op.error("this script takes no arguments.") sys.exit(1) opts.n_components = type_auto_or_int(opts.n_components) opts.density = type_auto_or_float(opts.density) selected_transformers = opts.selected_transformers.split(",") ########################################################################### # Generate dataset ########################################################################### n_nonzeros = int(opts.ratio_nonzeros * opts.n_features) print("Dataset statistics") print("===========================") print("n_samples \t= %s" % opts.n_samples) print("n_features \t= %s" % opts.n_features) if opts.n_components == "auto": print( "n_components \t= %s (auto)" % johnson_lindenstrauss_min_dim(n_samples=opts.n_samples, eps=opts.eps) ) else: print("n_components \t= %s" % opts.n_components) print("n_elements \t= %s" % (opts.n_features * opts.n_samples)) print("n_nonzeros \t= %s per feature" % n_nonzeros) print("ratio_nonzeros \t= %s" % opts.ratio_nonzeros) print("") ########################################################################### # Set transformer input ########################################################################### transformers = {} ########################################################################### # Set GaussianRandomProjection input gaussian_matrix_params = { "n_components": opts.n_components, "random_state": opts.random_seed, } transformers["GaussianRandomProjection"] = GaussianRandomProjection( **gaussian_matrix_params ) ########################################################################### # Set SparseRandomProjection input sparse_matrix_params = { "n_components": opts.n_components, "random_state": opts.random_seed, "density": opts.density, "eps": opts.eps, } transformers["SparseRandomProjection"] = SparseRandomProjection( **sparse_matrix_params ) ########################################################################### # Perform benchmark ########################################################################### time_fit = collections.defaultdict(list) time_transform 
= collections.defaultdict(list) print("Benchmarks") print("===========================") print("Generate dataset benchmarks... ", end="") X_dense, X_sparse = make_sparse_random_data( opts.n_samples, opts.n_features, n_nonzeros, random_state=opts.random_seed ) X = X_dense if opts.dense else X_sparse print("done") for name in selected_transformers: print("Perform benchmarks for %s..." % name) for iteration in range(opts.n_times): print("\titer %s..." % iteration, end="") time_to_fit, time_to_transform = bench_scikit_transformer( X_dense, transformers[name] ) time_fit[name].append(time_to_fit) time_transform[name].append(time_to_transform) print("done") print("") ########################################################################### # Print results ########################################################################### print("Script arguments") print("===========================") arguments = vars(opts) print( "%s \t | %s " % ( "Arguments".ljust(16), "Value".center(12), ) ) print(25 * "-" + ("|" + "-" * 14) * 1) for key, value in arguments.items(): print("%s \t | %s " % (str(key).ljust(16), str(value).strip().center(12))) print("") print("Transformer performance:") print("===========================") print("Results are averaged over %s repetition(s)." % opts.n_times) print("") print( "%s | %s | %s" % ("Transformer".ljust(30), "fit".center(12), "transform".center(12)) ) print(31 * "-" + ("|" + "-" * 14) * 2) for name in sorted(selected_transformers): print_row(name, np.mean(time_fit[name]), np.mean(time_transform[name])) print("") print("") ================================================ FILE: benchmarks/bench_rcv1_logreg_convergence.py ================================================ # Authors: Tom Dupre la Tour # Olivier Grisel # # License: BSD 3 clause import matplotlib.pyplot as plt from joblib import Memory import numpy as np import gc import time from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.datasets import fetch_rcv1 from sklearn.linear_model._sag import get_auto_step_size try: import lightning.classification as lightning_clf except ImportError: lightning_clf = None m = Memory(cachedir=".", verbose=0) # compute logistic loss def get_loss(w, intercept, myX, myy, C): n_samples = myX.shape[0] w = w.ravel() p = np.mean(np.log(1.0 + np.exp(-myy * (myX.dot(w) + intercept)))) print("%f + %f" % (p, w.dot(w) / 2.0 / C / n_samples)) p += w.dot(w) / 2.0 / C / n_samples return p # We use joblib to cache individual fits. Note that we do not pass the dataset # as argument as the hashing would be too slow, so we assume that the dataset # never changes. 
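# ----------------------------------------------------------------------
# Numeric sketch of the objective get_loss evaluates above: the mean
# logistic loss plus the L2 term w.w / (2 C n_samples). All *_demo names
# are illustrative, not part of the benchmark.
import numpy as np

rng_demo = np.random.RandomState(0)
X_demo = rng_demo.randn(50, 5)
y_demo = rng_demo.choice([-1.0, 1.0], size=50)
w_demo = rng_demo.randn(5)
C_demo, intercept_demo = 1.0, 0.0

margins = y_demo * (X_demo.dot(w_demo) + intercept_demo)
loss_demo = np.mean(np.log(1.0 + np.exp(-margins)))
loss_demo += w_demo.dot(w_demo) / 2.0 / C_demo / X_demo.shape[0]
print("penalized logistic loss: %f" % loss_demo)
# The cached helper below times one (classifier, n_iter) configuration:
# ----------------------------------------------------------------------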
@m.cache() def bench_one(name, clf_type, clf_params, n_iter): clf = clf_type(**clf_params) try: clf.set_params(max_iter=n_iter, random_state=42) except Exception: clf.set_params(n_iter=n_iter, random_state=42) st = time.time() clf.fit(X, y) end = time.time() try: C = 1.0 / clf.alpha / n_samples except Exception: C = clf.C try: intercept = clf.intercept_ except Exception: intercept = 0.0 train_loss = get_loss(clf.coef_, intercept, X, y, C) train_score = clf.score(X, y) test_score = clf.score(X_test, y_test) duration = end - st return train_loss, train_score, test_score, duration def bench(clfs): for ( name, clf, iter_range, train_losses, train_scores, test_scores, durations, ) in clfs: print("training %s" % name) clf_type = type(clf) clf_params = clf.get_params() for n_iter in iter_range: gc.collect() train_loss, train_score, test_score, duration = bench_one( name, clf_type, clf_params, n_iter ) train_losses.append(train_loss) train_scores.append(train_score) test_scores.append(test_score) durations.append(duration) print("classifier: %s" % name) print("train_loss: %.8f" % train_loss) print("train_score: %.8f" % train_score) print("test_score: %.8f" % test_score) print("time for fit: %.8f seconds" % duration) print("") print("") return clfs def plot_train_losses(clfs): plt.figure() for (name, _, _, train_losses, _, _, durations) in clfs: plt.plot(durations, train_losses, "-o", label=name) plt.legend(loc=0) plt.xlabel("seconds") plt.ylabel("train loss") def plot_train_scores(clfs): plt.figure() for (name, _, _, _, train_scores, _, durations) in clfs: plt.plot(durations, train_scores, "-o", label=name) plt.legend(loc=0) plt.xlabel("seconds") plt.ylabel("train score") plt.ylim((0.92, 0.96)) def plot_test_scores(clfs): plt.figure() for (name, _, _, _, _, test_scores, durations) in clfs: plt.plot(durations, test_scores, "-o", label=name) plt.legend(loc=0) plt.xlabel("seconds") plt.ylabel("test score") plt.ylim((0.92, 0.96)) def plot_dloss(clfs): plt.figure() pobj_final = [] for (name, _, _, train_losses, _, _, durations) in clfs: pobj_final.append(train_losses[-1]) indices = np.argsort(pobj_final) pobj_best = pobj_final[indices[0]] for (name, _, _, train_losses, _, _, durations) in clfs: log_pobj = np.log(abs(np.array(train_losses) - pobj_best)) / np.log(10) plt.plot(durations, log_pobj, "-o", label=name) plt.legend(loc=0) plt.xlabel("seconds") plt.ylabel("log(best - train_loss)") def get_max_squared_sum(X): """Get the maximum row-wise sum of squares""" return np.sum(X ** 2, axis=1).max() rcv1 = fetch_rcv1() X = rcv1.data n_samples, n_features = X.shape # consider the binary classification problem 'CCAT' vs the rest ccat_idx = rcv1.target_names.tolist().index("CCAT") y = rcv1.target.tocsc()[:, ccat_idx].toarray().ravel().astype(np.float64) y[y == 0] = -1 # parameters C = 1.0 fit_intercept = True tol = 1.0e-14 # max_iter range sgd_iter_range = list(range(1, 121, 10)) newton_iter_range = list(range(1, 25, 3)) lbfgs_iter_range = list(range(1, 242, 12)) liblinear_iter_range = list(range(1, 37, 3)) liblinear_dual_iter_range = list(range(1, 85, 6)) sag_iter_range = list(range(1, 37, 3)) clfs = [ ( "LR-liblinear", LogisticRegression( C=C, tol=tol, solver="liblinear", fit_intercept=fit_intercept, intercept_scaling=1, ), liblinear_iter_range, [], [], [], [], ), ( "LR-liblinear-dual", LogisticRegression( C=C, tol=tol, dual=True, solver="liblinear", fit_intercept=fit_intercept, intercept_scaling=1, ), liblinear_dual_iter_range, [], [], [], [], ), ( "LR-SAG", LogisticRegression(C=C, tol=tol, solver="sag", 
fit_intercept=fit_intercept), sag_iter_range, [], [], [], [], ), ( "LR-newton-cg", LogisticRegression( C=C, tol=tol, solver="newton-cg", fit_intercept=fit_intercept ), newton_iter_range, [], [], [], [], ), ( "LR-lbfgs", LogisticRegression(C=C, tol=tol, solver="lbfgs", fit_intercept=fit_intercept), lbfgs_iter_range, [], [], [], [], ), ( "SGD", SGDClassifier( alpha=1.0 / C / n_samples, penalty="l2", loss="log", fit_intercept=fit_intercept, verbose=0, ), sgd_iter_range, [], [], [], [], ), ] if lightning_clf is not None and not fit_intercept: alpha = 1.0 / C / n_samples # compute the same step_size than in LR-sag max_squared_sum = get_max_squared_sum(X) step_size = get_auto_step_size(max_squared_sum, alpha, "log", fit_intercept) clfs.append( ( "Lightning-SVRG", lightning_clf.SVRGClassifier( alpha=alpha, eta=step_size, tol=tol, loss="log" ), sag_iter_range, [], [], [], [], ) ) clfs.append( ( "Lightning-SAG", lightning_clf.SAGClassifier( alpha=alpha, eta=step_size, tol=tol, loss="log" ), sag_iter_range, [], [], [], [], ) ) # We keep only 200 features, to have a dense dataset, # and compare to lightning SAG, which seems incorrect in the sparse case. X_csc = X.tocsc() nnz_in_each_features = X_csc.indptr[1:] - X_csc.indptr[:-1] X = X_csc[:, np.argsort(nnz_in_each_features)[-200:]] X = X.toarray() print("dataset: %.3f MB" % (X.nbytes / 1e6)) # Split training and testing. Switch train and test subset compared to # LYRL2004 split, to have a larger training dataset. n = 23149 X_test = X[:n, :] y_test = y[:n] X = X[n:, :] y = y[n:] clfs = bench(clfs) plot_train_scores(clfs) plot_test_scores(clfs) plot_train_losses(clfs) plot_dloss(clfs) plt.show() ================================================ FILE: benchmarks/bench_saga.py ================================================ """Author: Arthur Mensch, Nelle Varoquaux Benchmarks of sklearn SAGA vs lightning SAGA vs Liblinear. Shows the gain in using multinomial logistic regression in term of learning time. """ import json import time import os from joblib import Parallel from sklearn.utils.fixes import delayed import matplotlib.pyplot as plt import numpy as np from sklearn.datasets import ( fetch_rcv1, load_iris, load_digits, fetch_20newsgroups_vectorized, ) from sklearn.linear_model import LogisticRegression from sklearn.metrics import log_loss from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelBinarizer, LabelEncoder from sklearn.utils.extmath import safe_sparse_dot, softmax def fit_single( solver, X, y, penalty="l2", single_target=True, C=1, max_iter=10, skip_slow=False, dtype=np.float64, ): if skip_slow and solver == "lightning" and penalty == "l1": print("skip_slowping l1 logistic regression with solver lightning.") return print( "Solving %s logistic regression with penalty %s, solver %s." 
% ("binary" if single_target else "multinomial", penalty, solver) ) if solver == "lightning": from lightning.classification import SAGAClassifier if single_target or solver not in ["sag", "saga"]: multi_class = "ovr" else: multi_class = "multinomial" X = X.astype(dtype) y = y.astype(dtype) X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42, stratify=y ) n_samples = X_train.shape[0] n_classes = np.unique(y_train).shape[0] test_scores = [1] train_scores = [1] accuracies = [1 / n_classes] times = [0] if penalty == "l2": alpha = 1.0 / (C * n_samples) beta = 0 lightning_penalty = None else: alpha = 0.0 beta = 1.0 / (C * n_samples) lightning_penalty = "l1" for this_max_iter in range(1, max_iter + 1, 2): print( "[%s, %s, %s] Max iter: %s" % ( "binary" if single_target else "multinomial", penalty, solver, this_max_iter, ) ) if solver == "lightning": lr = SAGAClassifier( loss="log", alpha=alpha, beta=beta, penalty=lightning_penalty, tol=-1, max_iter=this_max_iter, ) else: lr = LogisticRegression( solver=solver, multi_class=multi_class, C=C, penalty=penalty, fit_intercept=False, tol=0, max_iter=this_max_iter, random_state=42, ) # Makes cpu cache even for all fit calls X_train.max() t0 = time.clock() lr.fit(X_train, y_train) train_time = time.clock() - t0 scores = [] for (X, y) in [(X_train, y_train), (X_test, y_test)]: try: y_pred = lr.predict_proba(X) except NotImplementedError: # Lightning predict_proba is not implemented for n_classes > 2 y_pred = _predict_proba(lr, X) score = log_loss(y, y_pred, normalize=False) / n_samples score += 0.5 * alpha * np.sum(lr.coef_ ** 2) + beta * np.sum( np.abs(lr.coef_) ) scores.append(score) train_score, test_score = tuple(scores) y_pred = lr.predict(X_test) accuracy = np.sum(y_pred == y_test) / y_test.shape[0] test_scores.append(test_score) train_scores.append(train_score) accuracies.append(accuracy) times.append(train_time) return lr, times, train_scores, test_scores, accuracies def _predict_proba(lr, X): pred = safe_sparse_dot(X, lr.coef_.T) if hasattr(lr, "intercept_"): pred += lr.intercept_ return softmax(pred) def exp( solvers, penalty, single_target, n_samples=30000, max_iter=20, dataset="rcv1", n_jobs=1, skip_slow=False, ): dtypes_mapping = { "float64": np.float64, "float32": np.float32, } if dataset == "rcv1": rcv1 = fetch_rcv1() lbin = LabelBinarizer() lbin.fit(rcv1.target_names) X = rcv1.data y = rcv1.target y = lbin.inverse_transform(y) le = LabelEncoder() y = le.fit_transform(y) if single_target: y_n = y.copy() y_n[y > 16] = 1 y_n[y <= 16] = 0 y = y_n elif dataset == "digits": X, y = load_digits(return_X_y=True) if single_target: y_n = y.copy() y_n[y < 5] = 1 y_n[y >= 5] = 0 y = y_n elif dataset == "iris": iris = load_iris() X, y = iris.data, iris.target elif dataset == "20newspaper": ng = fetch_20newsgroups_vectorized() X = ng.data y = ng.target if single_target: y_n = y.copy() y_n[y > 4] = 1 y_n[y <= 16] = 0 y = y_n X = X[:n_samples] y = y[:n_samples] out = Parallel(n_jobs=n_jobs, mmap_mode=None)( delayed(fit_single)( solver, X, y, penalty=penalty, single_target=single_target, dtype=dtype, C=1, max_iter=max_iter, skip_slow=skip_slow, ) for solver in solvers for dtype in dtypes_mapping.values() ) res = [] idx = 0 for dtype_name in dtypes_mapping.keys(): for solver in solvers: if not (skip_slow and solver == "lightning" and penalty == "l1"): lr, times, train_scores, test_scores, accuracies = out[idx] this_res = dict( solver=solver, penalty=penalty, dtype=dtype_name, single_target=single_target, times=times, 
train_scores=train_scores, test_scores=test_scores, accuracies=accuracies, ) res.append(this_res) idx += 1 with open("bench_saga.json", "w+") as f: json.dump(res, f) def plot(outname=None): import pandas as pd with open("bench_saga.json", "r") as f: f = json.load(f) res = pd.DataFrame(f) res.set_index(["single_target"], inplace=True) grouped = res.groupby(level=["single_target"]) colors = {"saga": "C0", "liblinear": "C1", "lightning": "C2"} linestyles = {"float32": "--", "float64": "-"} alpha = {"float64": 0.5, "float32": 1} for idx, group in grouped: single_target = idx fig, axes = plt.subplots(figsize=(12, 4), ncols=4) ax = axes[0] for scores, times, solver, dtype in zip( group["train_scores"], group["times"], group["solver"], group["dtype"] ): ax.plot( times, scores, label="%s - %s" % (solver, dtype), color=colors[solver], alpha=alpha[dtype], marker=".", linestyle=linestyles[dtype], ) ax.axvline( times[-1], color=colors[solver], alpha=alpha[dtype], linestyle=linestyles[dtype], ) ax.set_xlabel("Time (s)") ax.set_ylabel("Training objective (relative to min)") ax.set_yscale("log") ax = axes[1] for scores, times, solver, dtype in zip( group["test_scores"], group["times"], group["solver"], group["dtype"] ): ax.plot( times, scores, label=solver, color=colors[solver], linestyle=linestyles[dtype], marker=".", alpha=alpha[dtype], ) ax.axvline( times[-1], color=colors[solver], alpha=alpha[dtype], linestyle=linestyles[dtype], ) ax.set_xlabel("Time (s)") ax.set_ylabel("Test objective (relative to min)") ax.set_yscale("log") ax = axes[2] for accuracy, times, solver, dtype in zip( group["accuracies"], group["times"], group["solver"], group["dtype"] ): ax.plot( times, accuracy, label="%s - %s" % (solver, dtype), alpha=alpha[dtype], marker=".", color=colors[solver], linestyle=linestyles[dtype], ) ax.axvline( times[-1], color=colors[solver], alpha=alpha[dtype], linestyle=linestyles[dtype], ) ax.set_xlabel("Time (s)") ax.set_ylabel("Test accuracy") ax.legend() name = "single_target" if single_target else "multi_target" name += "_%s" % penalty plt.suptitle(name) if outname is None: outname = name + ".png" fig.tight_layout() fig.subplots_adjust(top=0.9) ax = axes[3] for scores, times, solver, dtype in zip( group["train_scores"], group["times"], group["solver"], group["dtype"] ): ax.plot( np.arange(len(scores)), scores, label="%s - %s" % (solver, dtype), marker=".", alpha=alpha[dtype], color=colors[solver], linestyle=linestyles[dtype], ) ax.set_yscale("log") ax.set_xlabel("# iterations") ax.set_ylabel("Objective function") ax.legend() plt.savefig(outname) if __name__ == "__main__": solvers = ["saga", "liblinear", "lightning"] penalties = ["l1", "l2"] n_samples = [100000, 300000, 500000, 800000, None] single_target = True for penalty in penalties: for n_sample in n_samples: exp( solvers, penalty, single_target, n_samples=n_sample, n_jobs=1, dataset="rcv1", max_iter=10, ) if n_sample is not None: outname = "figures/saga_%s_%d.png" % (penalty, n_sample) else: outname = "figures/saga_%s_all.png" % (penalty,) try: os.makedirs("figures") except OSError: pass plot(outname) ================================================ FILE: benchmarks/bench_sample_without_replacement.py ================================================ """ Benchmarks for sampling without replacement of integer. 
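
The candidate implementations share the signature sample(n_population,
n_samples). A minimal sketch of the scikit-learn entry point exercised here
(the method names are the ones wired up later in this script):

    from sklearn.utils.random import sample_without_replacement

    sample_without_replacement(n_population=1000, n_samples=10, method="auto")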
""" import gc import sys import optparse from datetime import datetime import operator import matplotlib.pyplot as plt import numpy as np import random from sklearn.utils.random import sample_without_replacement def compute_time(t_start, delta): mu_second = 0.0 + 10 ** 6 # number of microseconds in a second return delta.seconds + delta.microseconds / mu_second def bench_sample(sampling, n_population, n_samples): gc.collect() # start time t_start = datetime.now() sampling(n_population, n_samples) delta = datetime.now() - t_start # stop time time = compute_time(t_start, delta) return time if __name__ == "__main__": ########################################################################### # Option parser ########################################################################### op = optparse.OptionParser() op.add_option( "--n-times", dest="n_times", default=5, type=int, help="Benchmark results are average over n_times experiments", ) op.add_option( "--n-population", dest="n_population", default=100000, type=int, help="Size of the population to sample from.", ) op.add_option( "--n-step", dest="n_steps", default=5, type=int, help="Number of step interval between 0 and n_population.", ) default_algorithms = ( "custom-tracking-selection,custom-auto," "custom-reservoir-sampling,custom-pool," "python-core-sample,numpy-permutation" ) op.add_option( "--algorithm", dest="selected_algorithm", default=default_algorithms, type=str, help=( "Comma-separated list of transformer to benchmark. " "Default: %default. \nAvailable: %default" ), ) # op.add_option("--random-seed", # dest="random_seed", default=13, type=int, # help="Seed used by the random number generators.") (opts, args) = op.parse_args() if len(args) > 0: op.error("this script takes no arguments.") sys.exit(1) selected_algorithm = opts.selected_algorithm.split(",") for key in selected_algorithm: if key not in default_algorithms.split(","): raise ValueError( 'Unknown sampling algorithm "%s" not in (%s).' 
% (key, default_algorithms) ) ########################################################################### # List sampling algorithm ########################################################################### # We assume that sampling algorithm has the following signature: # sample(n_population, n_sample) # sampling_algorithm = {} ########################################################################### # Set Python core input sampling_algorithm[ "python-core-sample" ] = lambda n_population, n_sample: random.sample(range(n_population), n_sample) ########################################################################### # Set custom automatic method selection sampling_algorithm[ "custom-auto" ] = lambda n_population, n_samples, random_state=None: sample_without_replacement( n_population, n_samples, method="auto", random_state=random_state ) ########################################################################### # Set custom tracking based method sampling_algorithm[ "custom-tracking-selection" ] = lambda n_population, n_samples, random_state=None: sample_without_replacement( n_population, n_samples, method="tracking_selection", random_state=random_state ) ########################################################################### # Set custom reservoir based method sampling_algorithm[ "custom-reservoir-sampling" ] = lambda n_population, n_samples, random_state=None: sample_without_replacement( n_population, n_samples, method="reservoir_sampling", random_state=random_state ) ########################################################################### # Set custom reservoir based method sampling_algorithm[ "custom-pool" ] = lambda n_population, n_samples, random_state=None: sample_without_replacement( n_population, n_samples, method="pool", random_state=random_state ) ########################################################################### # Numpy permutation based sampling_algorithm[ "numpy-permutation" ] = lambda n_population, n_sample: np.random.permutation(n_population)[:n_sample] ########################################################################### # Remove unspecified algorithm sampling_algorithm = { key: value for key, value in sampling_algorithm.items() if key in selected_algorithm } ########################################################################### # Perform benchmark ########################################################################### time = {} n_samples = np.linspace(start=0, stop=opts.n_population, num=opts.n_steps).astype( int ) ratio = n_samples / opts.n_population print("Benchmarks") print("===========================") for name in sorted(sampling_algorithm): print("Perform benchmarks for %s..." 
% name, end="") time[name] = np.zeros(shape=(opts.n_steps, opts.n_times)) for step in range(opts.n_steps): for it in range(opts.n_times): time[name][step, it] = bench_sample( sampling_algorithm[name], opts.n_population, n_samples[step] ) print("done") print("Averaging results...", end="") for name in sampling_algorithm: time[name] = np.mean(time[name], axis=1) print("done\n") # Print results ########################################################################### print("Script arguments") print("===========================") arguments = vars(opts) print( "%s \t | %s " % ( "Arguments".ljust(16), "Value".center(12), ) ) print(25 * "-" + ("|" + "-" * 14) * 1) for key, value in arguments.items(): print("%s \t | %s " % (str(key).ljust(16), str(value).strip().center(12))) print("") print("Sampling algorithm performance:") print("===============================") print("Results are averaged over %s repetition(s)." % opts.n_times) print("") fig = plt.figure("scikit-learn sample w/o replacement benchmark results") plt.title("n_population = %s, n_times = %s" % (opts.n_population, opts.n_times)) ax = fig.add_subplot(111) for name in sampling_algorithm: ax.plot(ratio, time[name], label=name) ax.set_xlabel("ratio of n_sample / n_population") ax.set_ylabel("Time (s)") ax.legend() # Sort legend labels handles, labels = ax.get_legend_handles_labels() hl = sorted(zip(handles, labels), key=operator.itemgetter(1)) handles2, labels2 = zip(*hl) ax.legend(handles2, labels2, loc=0) plt.show() ================================================ FILE: benchmarks/bench_sgd_regression.py ================================================ # Author: Peter Prettenhofer # License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt import gc from time import time from sklearn.linear_model import Ridge, SGDRegressor, ElasticNet from sklearn.metrics import mean_squared_error from sklearn.datasets import make_regression """ Benchmark for SGD regression Compares SGD regression against coordinate descent and Ridge on synthetic data. 
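
Four estimators are timed at each (n_samples, n_features) grid point:
ElasticNet (coordinate descent), SGDRegressor, averaged SGD (A-SGD) and
Ridge; the test RMSE and the training time are plotted per feature count.
The SGD runs use the inverse-scaling learning-rate schedule, i.e.
eta = eta0 / t**power_t.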
""" print(__doc__) if __name__ == "__main__": list_n_samples = np.linspace(100, 10000, 5).astype(int) list_n_features = [10, 100, 1000] n_test = 1000 max_iter = 1000 noise = 0.1 alpha = 0.01 sgd_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) elnet_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) ridge_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) asgd_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) for i, n_train in enumerate(list_n_samples): for j, n_features in enumerate(list_n_features): X, y, coef = make_regression( n_samples=n_train + n_test, n_features=n_features, noise=noise, coef=True, ) X_train = X[:n_train] y_train = y[:n_train] X_test = X[n_train:] y_test = y[n_train:] print("=======================") print("Round %d %d" % (i, j)) print("n_features:", n_features) print("n_samples:", n_train) # Shuffle data idx = np.arange(n_train) np.random.seed(13) np.random.shuffle(idx) X_train = X_train[idx] y_train = y_train[idx] std = X_train.std(axis=0) mean = X_train.mean(axis=0) X_train = (X_train - mean) / std X_test = (X_test - mean) / std std = y_train.std(axis=0) mean = y_train.mean(axis=0) y_train = (y_train - mean) / std y_test = (y_test - mean) / std gc.collect() print("- benchmarking ElasticNet") clf = ElasticNet(alpha=alpha, l1_ratio=0.5, fit_intercept=False) tstart = time() clf.fit(X_train, y_train) elnet_results[i, j, 0] = mean_squared_error(clf.predict(X_test), y_test) elnet_results[i, j, 1] = time() - tstart gc.collect() print("- benchmarking SGD") clf = SGDRegressor( alpha=alpha / n_train, fit_intercept=False, max_iter=max_iter, learning_rate="invscaling", eta0=0.01, power_t=0.25, tol=1e-3, ) tstart = time() clf.fit(X_train, y_train) sgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test), y_test) sgd_results[i, j, 1] = time() - tstart gc.collect() print("max_iter", max_iter) print("- benchmarking A-SGD") clf = SGDRegressor( alpha=alpha / n_train, fit_intercept=False, max_iter=max_iter, learning_rate="invscaling", eta0=0.002, power_t=0.05, tol=1e-3, average=(max_iter * n_train // 2), ) tstart = time() clf.fit(X_train, y_train) asgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test), y_test) asgd_results[i, j, 1] = time() - tstart gc.collect() print("- benchmarking RidgeRegression") clf = Ridge(alpha=alpha, fit_intercept=False) tstart = time() clf.fit(X_train, y_train) ridge_results[i, j, 0] = mean_squared_error(clf.predict(X_test), y_test) ridge_results[i, j, 1] = time() - tstart # Plot results i = 0 m = len(list_n_features) plt.figure("scikit-learn SGD regression benchmark results", figsize=(5 * 2, 4 * m)) for j in range(m): plt.subplot(m, 2, i + 1) plt.plot(list_n_samples, np.sqrt(elnet_results[:, j, 0]), label="ElasticNet") plt.plot(list_n_samples, np.sqrt(sgd_results[:, j, 0]), label="SGDRegressor") plt.plot(list_n_samples, np.sqrt(asgd_results[:, j, 0]), label="A-SGDRegressor") plt.plot(list_n_samples, np.sqrt(ridge_results[:, j, 0]), label="Ridge") plt.legend(prop={"size": 10}) plt.xlabel("n_train") plt.ylabel("RMSE") plt.title("Test error - %d features" % list_n_features[j]) i += 1 plt.subplot(m, 2, i + 1) plt.plot(list_n_samples, np.sqrt(elnet_results[:, j, 1]), label="ElasticNet") plt.plot(list_n_samples, np.sqrt(sgd_results[:, j, 1]), label="SGDRegressor") plt.plot(list_n_samples, np.sqrt(asgd_results[:, j, 1]), label="A-SGDRegressor") plt.plot(list_n_samples, np.sqrt(ridge_results[:, j, 1]), label="Ridge") plt.legend(prop={"size": 10}) plt.xlabel("n_train") plt.ylabel("Time 
[sec]") plt.title("Training time - %d features" % list_n_features[j]) i += 1 plt.subplots_adjust(hspace=0.30) plt.show() ================================================ FILE: benchmarks/bench_sparsify.py ================================================ """ Benchmark SGD prediction time with dense/sparse coefficients. Invoke with ----------- $ kernprof.py -l sparsity_benchmark.py $ python -m line_profiler sparsity_benchmark.py.lprof Typical output -------------- input data sparsity: 0.050000 true coef sparsity: 0.000100 test data sparsity: 0.027400 model sparsity: 0.000024 r^2 on test data (dense model) : 0.233651 r^2 on test data (sparse model) : 0.233651 Wrote profile results to sparsity_benchmark.py.lprof Timer unit: 1e-06 s File: sparsity_benchmark.py Function: benchmark_dense_predict at line 51 Total time: 0.532979 s Line # Hits Time Per Hit % Time Line Contents ============================================================== 51 @profile 52 def benchmark_dense_predict(): 53 301 640 2.1 0.1 for _ in range(300): 54 300 532339 1774.5 99.9 clf.predict(X_test) File: sparsity_benchmark.py Function: benchmark_sparse_predict at line 56 Total time: 0.39274 s Line # Hits Time Per Hit % Time Line Contents ============================================================== 56 @profile 57 def benchmark_sparse_predict(): 58 1 10854 10854.0 2.8 X_test_sparse = csr_matrix(X_test) 59 301 477 1.6 0.1 for _ in range(300): 60 300 381409 1271.4 97.1 clf.predict(X_test_sparse) """ from scipy.sparse.csr import csr_matrix import numpy as np from sklearn.linear_model import SGDRegressor from sklearn.metrics import r2_score np.random.seed(42) def sparsity_ratio(X): return np.count_nonzero(X) / float(n_samples * n_features) n_samples, n_features = 5000, 300 X = np.random.randn(n_samples, n_features) inds = np.arange(n_samples) np.random.shuffle(inds) X[inds[int(n_features / 1.2) :]] = 0 # sparsify input print("input data sparsity: %f" % sparsity_ratio(X)) coef = 3 * np.random.randn(n_features) inds = np.arange(n_features) np.random.shuffle(inds) coef[inds[n_features // 2 :]] = 0 # sparsify coef print("true coef sparsity: %f" % sparsity_ratio(coef)) y = np.dot(X, coef) # add noise y += 0.01 * np.random.normal((n_samples,)) # Split data in train set and test set n_samples = X.shape[0] X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] print("test data sparsity: %f" % sparsity_ratio(X_test)) ############################################################################### clf = SGDRegressor(penalty="l1", alpha=0.2, max_iter=2000, tol=None) clf.fit(X_train, y_train) print("model sparsity: %f" % sparsity_ratio(clf.coef_)) def benchmark_dense_predict(): for _ in range(300): clf.predict(X_test) def benchmark_sparse_predict(): X_test_sparse = csr_matrix(X_test) for _ in range(300): clf.predict(X_test_sparse) def score(y_test, y_pred, case): r2 = r2_score(y_test, y_pred) print("r^2 on test data (%s) : %f" % (case, r2)) score(y_test, clf.predict(X_test), "dense model") benchmark_dense_predict() clf.sparsify() score(y_test, clf.predict(X_test), "sparse model") benchmark_sparse_predict() ================================================ FILE: benchmarks/bench_text_vectorizers.py ================================================ """ To run this benchmark, you will need, * scikit-learn * pandas * memory_profiler * psutil (optional, but recommended) """ import timeit import itertools import numpy as np import pandas as pd from memory_profiler import memory_usage from 
sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import ( CountVectorizer, TfidfVectorizer, HashingVectorizer, ) n_repeat = 3 def run_vectorizer(Vectorizer, X, **params): def f(): vect = Vectorizer(**params) vect.fit_transform(X) return f text = fetch_20newsgroups(subset="train").data[:1000] print("=" * 80 + "\n#" + " Text vectorizers benchmark" + "\n" + "=" * 80 + "\n") print("Using a subset of the 20 newsgroups dataset ({} documents).".format(len(text))) print("This benchmarks runs in ~1 min ...") res = [] for Vectorizer, (analyzer, ngram_range) in itertools.product( [CountVectorizer, TfidfVectorizer, HashingVectorizer], [("word", (1, 1)), ("word", (1, 2)), ("char", (4, 4)), ("char_wb", (4, 4))], ): bench = {"vectorizer": Vectorizer.__name__} params = {"analyzer": analyzer, "ngram_range": ngram_range} bench.update(params) dt = timeit.repeat( run_vectorizer(Vectorizer, text, **params), number=1, repeat=n_repeat ) bench["time"] = "{:.3f} (+-{:.3f})".format(np.mean(dt), np.std(dt)) mem_usage = memory_usage(run_vectorizer(Vectorizer, text, **params)) bench["memory"] = "{:.1f}".format(np.max(mem_usage)) res.append(bench) df = pd.DataFrame(res).set_index(["analyzer", "ngram_range", "vectorizer"]) print("\n========== Run time performance (sec) ===========\n") print( "Computing the mean and the standard deviation " "of the run time over {} runs...\n".format(n_repeat) ) print(df["time"].unstack(level=-1)) print("\n=============== Memory usage (MB) ===============\n") print(df["memory"].unstack(level=-1)) ================================================ FILE: benchmarks/bench_tree.py ================================================ """ To run this, you'll need to have installed. * scikit-learn Does two benchmarks First, we fix a training set, increase the number of samples to classify and plot number of classified samples as a function of time. In the second benchmark, we increase the number of dimensions of the training set, classify a sample and plot the time taken as a function of the number of dimensions. 
""" import numpy as np import matplotlib.pyplot as plt import gc from datetime import datetime # to store the results scikit_classifier_results = [] scikit_regressor_results = [] mu_second = 0.0 + 10 ** 6 # number of microseconds in a second def bench_scikit_tree_classifier(X, Y): """Benchmark with scikit-learn decision tree classifier""" from sklearn.tree import DecisionTreeClassifier gc.collect() # start time tstart = datetime.now() clf = DecisionTreeClassifier() clf.fit(X, Y).predict(X) delta = datetime.now() - tstart # stop time scikit_classifier_results.append(delta.seconds + delta.microseconds / mu_second) def bench_scikit_tree_regressor(X, Y): """Benchmark with scikit-learn decision tree regressor""" from sklearn.tree import DecisionTreeRegressor gc.collect() # start time tstart = datetime.now() clf = DecisionTreeRegressor() clf.fit(X, Y).predict(X) delta = datetime.now() - tstart # stop time scikit_regressor_results.append(delta.seconds + delta.microseconds / mu_second) if __name__ == "__main__": print("============================================") print("Warning: this is going to take a looong time") print("============================================") n = 10 step = 10000 n_samples = 10000 dim = 10 n_classes = 10 for i in range(n): print("============================================") print("Entering iteration %s of %s" % (i, n)) print("============================================") n_samples += step X = np.random.randn(n_samples, dim) Y = np.random.randint(0, n_classes, (n_samples,)) bench_scikit_tree_classifier(X, Y) Y = np.random.randn(n_samples) bench_scikit_tree_regressor(X, Y) xx = range(0, n * step, step) plt.figure("scikit-learn tree benchmark results") plt.subplot(211) plt.title("Learning with varying number of samples") plt.plot(xx, scikit_classifier_results, "g-", label="classification") plt.plot(xx, scikit_regressor_results, "r-", label="regression") plt.legend(loc="upper left") plt.xlabel("number of samples") plt.ylabel("Time (s)") scikit_classifier_results = [] scikit_regressor_results = [] n = 10 step = 500 start_dim = 500 n_classes = 10 dim = start_dim for i in range(0, n): print("============================================") print("Entering iteration %s of %s" % (i, n)) print("============================================") dim += step X = np.random.randn(100, dim) Y = np.random.randint(0, n_classes, (100,)) bench_scikit_tree_classifier(X, Y) Y = np.random.randn(100) bench_scikit_tree_regressor(X, Y) xx = np.arange(start_dim, start_dim + n * step, step) plt.subplot(212) plt.title("Learning in high dimensional spaces") plt.plot(xx, scikit_classifier_results, "g-", label="classification") plt.plot(xx, scikit_regressor_results, "r-", label="regression") plt.legend(loc="upper left") plt.xlabel("number of dimensions") plt.ylabel("Time (s)") plt.axis("tight") plt.show() ================================================ FILE: benchmarks/bench_tsne_mnist.py ================================================ """ ============================= MNIST dataset T-SNE benchmark ============================= """ # License: BSD 3 clause import os import os.path as op from time import time import numpy as np import json import argparse from joblib import Memory from sklearn.datasets import fetch_openml from sklearn.manifold import TSNE from sklearn.neighbors import NearestNeighbors from sklearn.decomposition import PCA from sklearn.utils import check_array from sklearn.utils import shuffle as _shuffle from sklearn.utils._openmp_helpers import _openmp_effective_n_threads LOG_DIR = 
"mnist_tsne_output" if not os.path.exists(LOG_DIR): os.mkdir(LOG_DIR) memory = Memory(os.path.join(LOG_DIR, "mnist_tsne_benchmark_data"), mmap_mode="r") @memory.cache def load_data(dtype=np.float32, order="C", shuffle=True, seed=0): """Load the data, then cache and memmap the train/test split""" print("Loading dataset...") data = fetch_openml("mnist_784") X = check_array(data["data"], dtype=dtype, order=order) y = data["target"] if shuffle: X, y = _shuffle(X, y, random_state=seed) # Normalize features X /= 255 return X, y def nn_accuracy(X, X_embedded, k=1): """Accuracy of the first nearest neighbor""" knn = NearestNeighbors(n_neighbors=1, n_jobs=-1) _, neighbors_X = knn.fit(X).kneighbors() _, neighbors_X_embedded = knn.fit(X_embedded).kneighbors() return np.mean(neighbors_X == neighbors_X_embedded) def tsne_fit_transform(model, data): transformed = model.fit_transform(data) return transformed, model.n_iter_ def sanitize(filename): return filename.replace("/", "-").replace(" ", "_") if __name__ == "__main__": parser = argparse.ArgumentParser("Benchmark for t-SNE") parser.add_argument( "--order", type=str, default="C", help="Order of the input data" ) parser.add_argument("--perplexity", type=float, default=30) parser.add_argument( "--bhtsne", action="store_true", help=( "if set and the reference bhtsne code is " "correctly installed, run it in the benchmark." ), ) parser.add_argument( "--all", action="store_true", help=( "if set, run the benchmark with the whole MNIST." "dataset. Note that it will take up to 1 hour." ), ) parser.add_argument( "--profile", action="store_true", help="if set, run the benchmark with a memory profiler.", ) parser.add_argument("--verbose", type=int, default=0) parser.add_argument( "--pca-components", type=int, default=50, help="Number of principal components for preprocessing.", ) args = parser.parse_args() print("Used number of threads: {}".format(_openmp_effective_n_threads())) X, y = load_data(order=args.order) if args.pca_components > 0: t0 = time() X = PCA(n_components=args.pca_components).fit_transform(X) print( "PCA preprocessing down to {} dimensions took {:0.3f}s".format( args.pca_components, time() - t0 ) ) methods = [] # Put TSNE in methods tsne = TSNE( n_components=2, init="pca", perplexity=args.perplexity, verbose=args.verbose, n_iter=1000, ) methods.append(("sklearn TSNE", lambda data: tsne_fit_transform(tsne, data))) if args.bhtsne: try: from bhtsne.bhtsne import run_bh_tsne except ImportError as e: raise ImportError( """\ If you want comparison with the reference implementation, build the binary from source (https://github.com/lvdmaaten/bhtsne) in the folder benchmarks/bhtsne and add an empty `__init__.py` file in the folder: $ git clone git@github.com:lvdmaaten/bhtsne.git $ cd bhtsne $ g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2 $ touch __init__.py $ cd .. """ ) from e def bhtsne(X): """Wrapper for the reference lvdmaaten/bhtsne implementation.""" # PCA preprocessing is done elsewhere in the benchmark script n_iter = -1 # TODO find a way to report the number of iterations return ( run_bh_tsne( X, use_pca=False, perplexity=args.perplexity, verbose=args.verbose > 0, ), n_iter, ) methods.append(("lvdmaaten/bhtsne", bhtsne)) if args.profile: try: from memory_profiler import profile except ImportError as e: raise ImportError( "To run the benchmark with `--profile`, you " "need to install `memory_profiler`. Please " "run `pip install memory_profiler`." 
) from e methods = [(n, profile(m)) for n, m in methods] data_size = [100, 500, 1000, 5000, 10000] if args.all: data_size.append(70000) results = [] basename = os.path.basename(os.path.splitext(__file__)[0]) log_filename = os.path.join(LOG_DIR, basename + ".json") for n in data_size: X_train = X[:n] y_train = y[:n] n = X_train.shape[0] for name, method in methods: print("Fitting {} on {} samples...".format(name, n)) t0 = time() np.save( os.path.join(LOG_DIR, "mnist_{}_{}.npy".format("original", n)), X_train ) np.save( os.path.join(LOG_DIR, "mnist_{}_{}.npy".format("original_labels", n)), y_train, ) X_embedded, n_iter = method(X_train) duration = time() - t0 precision_5 = nn_accuracy(X_train, X_embedded) print( "Fitting {} on {} samples took {:.3f}s in {:d} iterations, " "nn accuracy: {:0.3f}".format(name, n, duration, n_iter, precision_5) ) results.append(dict(method=name, duration=duration, n_samples=n)) with open(log_filename, "w", encoding="utf-8") as f: json.dump(results, f) method_name = sanitize(name) np.save( op.join(LOG_DIR, "mnist_{}_{}.npy".format(method_name, n)), X_embedded ) ================================================ FILE: benchmarks/plot_tsne_mnist.py ================================================ import matplotlib.pyplot as plt import numpy as np import os.path as op import argparse LOG_DIR = "mnist_tsne_output" if __name__ == "__main__": parser = argparse.ArgumentParser("Plot benchmark results for t-SNE") parser.add_argument( "--labels", type=str, default=op.join(LOG_DIR, "mnist_original_labels_10000.npy"), help="1D integer numpy array for labels", ) parser.add_argument( "--embedding", type=str, default=op.join(LOG_DIR, "mnist_sklearn_TSNE_10000.npy"), help="2D float numpy array for embedded data", ) args = parser.parse_args() X = np.load(args.embedding) y = np.load(args.labels) for i in np.unique(y): mask = y == i plt.scatter(X[mask, 0], X[mask, 1], alpha=0.2, label=int(i)) plt.legend(loc="best") plt.show() ================================================ FILE: build_tools/Makefile ================================================ # Makefile for maintenance tools authors: python generate_authors_table.py ================================================ FILE: build_tools/azure/install.sh ================================================ #!/bin/bash set -e set -x UNAMESTR=`uname` if [[ "$DISTRIB" == "conda-mamba-pypy3" ]]; then # condaforge/mambaforge-pypy3 needs compilers apt-get -yq update apt-get -yq install build-essential fi make_conda() { TO_INSTALL="$@" if [[ "$DISTRIB" == *"mamba"* ]]; then mamba create -n $VIRTUALENV --yes $TO_INSTALL else conda config --show conda create -n $VIRTUALENV --yes $TO_INSTALL fi source activate $VIRTUALENV } setup_ccache() { echo "Setting up ccache" mkdir /tmp/ccache/ which ccache for name in gcc g++ cc c++ x86_64-linux-gnu-gcc x86_64-linux-gnu-c++; do ln -s $(which ccache) "/tmp/ccache/${name}" done export PATH="/tmp/ccache/:${PATH}" ccache -M 256M } # imports get_dep source build_tools/shared.sh if [[ "$DISTRIB" == "conda" || "$DISTRIB" == *"mamba"* ]]; then if [[ "$CONDA_CHANNEL" != "" ]]; then TO_INSTALL="--override-channels -c $CONDA_CHANNEL" else TO_INSTALL="" fi if [[ "$DISTRIB" == *"pypy"* ]]; then TO_INSTALL="$TO_INSTALL pypy" else TO_INSTALL="$TO_INSTALL python=$PYTHON_VERSION" fi TO_INSTALL="$TO_INSTALL ccache pip blas[build=$BLAS]" TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep cython $CYTHON_VERSION)" 
TO_INSTALL="$TO_INSTALL $(get_dep joblib $JOBLIB_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep pandas $PANDAS_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep pyamg $PYAMG_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep Pillow $PILLOW_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep matplotlib $MATPLOTLIB_VERSION)" if [[ "$UNAMESTR" == "Darwin" ]]; then if [[ "$SKLEARN_TEST_NO_OPENMP" != "true" ]]; then # on macOS, install an OpenMP-enabled clang/llvm from conda-forge. # TODO: Remove !=1.1.0 when the following is fixed: # sklearn/svm/_libsvm.cpython-38-darwin.so, # 2): Symbol not found: _svm_check_parameter error TO_INSTALL="$TO_INSTALL compilers>=1.0.4,!=1.1.0 llvm-openmp" else # Without openmp, we use the system clang. Here we use /usr/bin/ar # instead because llvm-ar errors export AR=/usr/bin/ar fi else # FIXME: temporary fix to link against system libraries on linux export LDFLAGS="$LDFLAGS -Wl,--sysroot=/" fi make_conda $TO_INSTALL setup_ccache elif [[ "$DISTRIB" == "ubuntu" ]]; then sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test sudo apt-get update sudo apt-get install python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv ccache python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV source $VIRTUALENV/bin/activate setup_ccache python -m pip install $(get_dep cython $CYTHON_VERSION) \ $(get_dep joblib $JOBLIB_VERSION) elif [[ "$DISTRIB" == "debian-32" ]]; then apt-get update apt-get install -y python3-dev python3-numpy python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv python3-pandas ccache python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV source $VIRTUALENV/bin/activate setup_ccache python -m pip install $(get_dep cython $CYTHON_VERSION) \ $(get_dep joblib $JOBLIB_VERSION) elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then # FIXME: temporary fix to link against system libraries on linux export LDFLAGS="$LDFLAGS -Wl,--sysroot=/" # Since conda main channel usually lacks behind on the latest releases, # we use pypi to test against the latest releases of the dependencies. # conda is still used as a convenient way to install Python and pip. make_conda "ccache python=$PYTHON_VERSION" setup_ccache python -m pip install -U pip # Do not build scikit-image from source because it is an optional dependency python -m pip install --only-binary :all: scikit-image || true python -m pip install pandas matplotlib pyamg # do not install dependencies for lightgbm since it requires scikit-learn. 
python -m pip install "lightgbm>=3.0.0" --no-deps elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then # FIXME: temporary fix to link against system libraries on linux export LDFLAGS="$LDFLAGS -Wl,--sysroot=/" make_conda "ccache python=$PYTHON_VERSION" python -m pip install -U pip echo "Installing numpy and scipy master wheels" dev_anaconda_url=https://pypi.anaconda.org/scipy-wheels-nightly/simple pip install --pre --upgrade --timeout=60 --extra-index $dev_anaconda_url numpy pandas scipy pip install --pre cython setup_ccache echo "Installing joblib master" pip install https://github.com/joblib/joblib/archive/master.zip echo "Installing pillow master" pip install https://github.com/python-pillow/Pillow/archive/main.zip fi python -m pip install $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ $(get_dep pytest $PYTEST_VERSION) \ $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) if [[ "$COVERAGE" == "true" ]]; then python -m pip install codecov pytest-cov fi if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then python -m pip install pytest-xdist fi if [[ "$TEST_DOCSTRINGS" == "true" ]]; then # numpydoc requires sphinx python -m pip install sphinx python -m pip install numpydoc fi python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" python -c "\ try: import pandas print('pandas %s' % pandas.__version__) except ImportError: print('pandas not installed') " # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI # workers with 2 cores when building the compiled extensions of scikit-learn. export SKLEARN_BUILD_PARALLEL=3 python -m pip list if [[ "$DISTRIB" == "conda-pip-latest" ]]; then # Check that pip can automatically build scikit-learn with the build # dependencies specified in pyproject.toml using an isolated build # environment: pip install --verbose --editable . else if [[ "$BUILD_WITH_ICC" == "true" ]]; then wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" sudo apt-get update sudo apt-get install intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic source /opt/intel/oneapi/setvars.sh # The "build_clib" command is implicitly used to build "libsvm-skl". # To compile with a different compiler, we also need to specify the # compiler for this command python setup.py build_ext --compiler=intelem -i build_clib --compiler=intelem fi # Use the pre-installed build dependencies and build directly in the # current environment. 
python setup.py develop fi ccache -s ================================================ FILE: build_tools/azure/install_win.sh ================================================ #!/bin/bash set -e set -x if [[ "$PYTHON_ARCH" == "64" ]]; then conda create -n $VIRTUALENV -q -y python=$PYTHON_VERSION numpy scipy cython matplotlib wheel pillow joblib source activate $VIRTUALENV pip install threadpoolctl if [[ "$PYTEST_VERSION" == "*" ]]; then pip install pytest else pip install pytest==$PYTEST_VERSION fi else pip install numpy scipy cython pytest wheel pillow joblib threadpoolctl fi if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then pip install pytest-xdist fi if [[ "$COVERAGE" == "true" ]]; then pip install coverage codecov pytest-cov fi python --version pip --version # Build scikit-learn python setup.py bdist_wheel # Install the generated wheel package to test it pip install --pre --no-index --find-links dist scikit-learn ================================================ FILE: build_tools/azure/posix-docker.yml ================================================ parameters: name: '' vmImage: '' matrix: [] dependsOn: [] condition: ne(variables['Build.Reason'], 'Schedule') jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} condition: ${{ parameters.condition }} pool: vmImage: ${{ parameters.vmImage }} variables: TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' JUNITXML: 'test-data.xml' OMP_NUM_THREADS: '2' OPENBLAS_NUM_THREADS: '2' SKLEARN_SKIP_NETWORK_TESTS: '1' NUMPY_VERSION: 'latest' SCIPY_VERSION: 'latest' CYTHON_VERSION: 'latest' JOBLIB_VERSION: 'latest' PANDAS_VERSION: 'latest' PYAMG_VERSION: 'latest' PILLOW_VERSION: 'latest' MATPLOTLIB_VERSION: 'latest' PYTEST_VERSION: 'latest' PYTEST_XDIST_VERSION: 'latest' THREADPOOLCTL_VERSION: 'latest' COVERAGE: 'false' TEST_DOCSTRINGS: 'false' BLAS: 'openblas' # Set in azure-pipelines.yml DISTRIB: '' DOCKER_CONTAINER: '' SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: # Container is detached and sleeping, allowing steps to run commands # in the container. 
The TEST_DIR is mapped allowing the host to access # the JUNITXML file - script: > docker container run --rm --volume $TEST_DIR:/temp_dir --volume $PWD:/io -w /io --detach --name skcontainer -e DISTRIB=$DISTRIB -e TEST_DIR=/temp_dir -e JUNITXML=$JUNITXML -e VIRTUALENV=testvenv -e NUMPY_VERSION=$NUMPY_VERSION -e SCIPY_VERSION=$SCIPY_VERSION -e CYTHON_VERSION=$CYTHON_VERSION -e JOBLIB_VERSION=$JOBLIB_VERSION -e PANDAS_VERSION=$PANDAS_VERSION -e PYAMG_VERSION=$PYAMG_VERSION -e PILLOW_VERSION=$PILLOW_VERSION -e MATPLOTLIB_VERSION=$MATPLOTLIB_VERSION -e PYTEST_VERSION=$PYTEST_VERSION -e PYTEST_XDIST_VERSION=$PYTEST_XDIST_VERSION -e THREADPOOLCTL_VERSION=$THREADPOOLCTL_VERSION -e OMP_NUM_THREADS=$OMP_NUM_THREADS -e OPENBLAS_NUM_THREADS=$OPENBLAS_NUM_THREADS -e SKLEARN_SKIP_NETWORK_TESTS=$SKLEARN_SKIP_NETWORK_TESTS -e BLAS=$BLAS $DOCKER_CONTAINER sleep 1000000 displayName: 'Start container' - script: > docker exec skcontainer ./build_tools/azure/install.sh displayName: 'Install' - script: > docker exec skcontainer ./build_tools/azure/test_script.sh displayName: 'Test Library' - task: PublishTestResults@2 inputs: testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() - script: > docker container stop skcontainer displayName: 'Stop container' condition: always() ================================================ FILE: build_tools/azure/posix.yml ================================================ parameters: name: '' vmImage: '' matrix: [] dependsOn: [] condition: '' jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} condition: ${{ parameters.condition }} pool: vmImage: ${{ parameters.vmImage }} variables: TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' VIRTUALENV: 'testvenv' JUNITXML: 'test-data.xml' OMP_NUM_THREADS: '2' OPENBLAS_NUM_THREADS: '2' SKLEARN_SKIP_NETWORK_TESTS: '1' CCACHE_DIR: $(Pipeline.Workspace)/ccache CCACHE_COMPRESS: '1' NUMPY_VERSION: 'latest' SCIPY_VERSION: 'latest' CYTHON_VERSION: 'latest' JOBLIB_VERSION: 'latest' PANDAS_VERSION: 'latest' PYAMG_VERSION: 'latest' PILLOW_VERSION: 'latest' MATPLOTLIB_VERSION: 'latest' PYTEST_VERSION: 'latest' PYTEST_XDIST_VERSION: 'latest' THREADPOOLCTL_VERSION: 'latest' COVERAGE: 'true' TEST_DOCSTRINGS: 'false' CREATE_ISSUE_ON_TRACKER: 'false' SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: - bash: echo "##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH condition: startsWith(variables['DISTRIB'], 'conda') - bash: sudo chown -R $USER $CONDA displayName: Take ownership of conda installation condition: startsWith(variables['DISTRIB'], 'conda') - task: Cache@2 inputs: key: '"$(Agent.JobName)"' path: $(CCACHE_DIR) displayName: ccache continueOnError: true - script: | build_tools/azure/install.sh displayName: 'Install' - script: | build_tools/azure/test_script.sh displayName: 'Test Library' - script: | build_tools/azure/test_docs.sh displayName: 'Test Docs' - script: | build_tools/azure/test_docstring.sh displayName: "Numpydoc validation" condition: eq(variables['TEST_DOCSTRINGS'], 'true') - script: | build_tools/azure/test_pytest_soft_dependency.sh displayName: 'Test Soft Dependency' condition: eq(variables['CHECK_PYTEST_SOFT_DEPENDENCY'], 'true') - task: PublishTestResults@2 inputs: testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: 
succeededOrFailed() - task: UsePythonVersion@0 inputs: versionSpec: '3.9' displayName: Place Python into path to update issue tracker condition: and(succeededOrFailed(), eq(variables['CREATE_ISSUE_ON_TRACKER'], 'true'), eq(variables['Build.Reason'], 'Schedule')) - bash: | set -ex if [[ $(BOT_GITHUB_TOKEN) == "" ]]; then echo "GitHub Token is not set. Issue tracker will not be updated." exit fi LINK_TO_RUN="https://dev.azure.com/$BUILD_REPOSITORY_NAME/_build/results?buildId=$BUILD_BUILDID&view=logs&j=$SYSTEM_JOBID" CI_NAME="$SYSTEM_JOBIDENTIFIER" ISSUE_REPO="$BUILD_REPOSITORY_NAME" pip install defusedxml PyGithub python maint_tools/create_issue_from_juint.py $(BOT_GITHUB_TOKEN) $CI_NAME $ISSUE_REPO $LINK_TO_RUN $JUNIT_FILE displayName: 'Update issue tracker' env: JUNIT_FILE: $(TEST_DIR)/$(JUNITXML) condition: and(succeededOrFailed(), eq(variables['CREATE_ISSUE_ON_TRACKER'], 'true'), eq(variables['Build.Reason'], 'Schedule')) - script: | build_tools/azure/upload_codecov.sh condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) displayName: 'Upload To Codecov' env: CODECOV_TOKEN: $(CODECOV_TOKEN) ================================================ FILE: build_tools/azure/test_docs.sh ================================================ #!/bin/bash set -e if [[ "$DISTRIB" =~ ^conda.* ]]; then source activate $VIRTUALENV elif [[ "$DISTRIB" == "ubuntu" ]]; then source $VIRTUALENV/bin/activate fi if [[ "$BUILD_WITH_ICC" == "true" ]]; then source /opt/intel/oneapi/setvars.sh fi make test-doc ================================================ FILE: build_tools/azure/test_docstring.sh ================================================ #!/bin/bash set -e if [[ "$DISTRIB" =~ ^conda.* ]]; then source activate $VIRTUALENV elif [[ "$DISTRIB" == "ubuntu" ]]; then source $VIRTUALENV/bin/activate fi if [[ "$BUILD_WITH_ICC" == "true" ]]; then source /opt/intel/oneapi/setvars.sh fi pytest maint_tools/test_docstrings.py ================================================ FILE: build_tools/azure/test_pytest_soft_dependency.sh ================================================ #!/bin/bash set -e # called when DISTRIB=="conda" source activate $VIRTUALENV conda remove -y py pytest || pip uninstall -y py pytest if [[ "$COVERAGE" == "true" ]]; then # conda may remove coverage when uninstall pytest and py pip install coverage # Need to append the coverage to the existing .coverage generated by # running the tests. Make sure to reuse the same coverage # configuration as the one used by the main pytest run to be # able to combine the results. CMD="coverage run --rcfile=$BUILD_SOURCESDIRECTORY/.coveragerc" else CMD="python" fi # .coverage from running the tests is in TEST_DIR pushd $TEST_DIR $CMD -m sklearn.utils.tests.test_estimator_checks popd ================================================ FILE: build_tools/azure/test_script.sh ================================================ #!/bin/bash set -e if [[ "$DISTRIB" =~ ^conda.* ]]; then source activate $VIRTUALENV elif [[ "$DISTRIB" == "ubuntu" ]] || [[ "$DISTRIB" == "debian-32" ]]; then source $VIRTUALENV/bin/activate fi if [[ "$BUILD_WITH_ICC" == "true" ]]; then source /opt/intel/oneapi/setvars.sh fi mkdir -p $TEST_DIR cp setup.cfg $TEST_DIR cd $TEST_DIR python -c "import sklearn; sklearn.show_versions()" if ! 
command -v conda &> /dev/null then pip list else # conda list provides more info than pip list (when available) conda list fi TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then # Note: --cov-report= is used to disable to long text output report in the # CI logs. The coverage data is consolidated by codecov to get an online # web report across all the platforms so there is no need for this text # report that otherwise hides the test failures and forces long scrolls in # the CI logs. export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" TEST_CMD="$TEST_CMD --cov-config='$COVERAGE_PROCESS_START' --cov sklearn --cov-report=" fi if [[ -n "$CHECK_WARNINGS" ]]; then # numpy's 1.19.0's tostring() deprecation is ignored until scipy and joblib removes its usage TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Wignore:tostring:DeprecationWarning" # Python 3.10 deprecates disutils and is imported by numpy interally during import time TEST_CMD="$TEST_CMD -Wignore:The\ distutils:DeprecationWarning" # Workaround for https://github.com/pypa/setuptools/issues/2885 TEST_CMD="$TEST_CMD -Wignore:Creating\ a\ LegacyVersion:DeprecationWarning" fi if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then TEST_CMD="$TEST_CMD -n2" fi if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then TEST_CMD="$TEST_CMD -ra" fi set -x eval "$TEST_CMD --pyargs sklearn" set +x ================================================ FILE: build_tools/azure/upload_codecov.sh ================================================ #!/bin/bash set -e # called when COVERAGE=="true" and DISTRIB=="conda" export PATH=$HOME/miniconda3/bin:$PATH source activate $VIRTUALENV # Need to run codecov from a git checkout, so we copy .coverage # from TEST_DIR where pytest has been run pushd $TEST_DIR coverage combine --append popd cp $TEST_DIR/.coverage $BUILD_REPOSITORY_LOCALPATH codecov --root $BUILD_REPOSITORY_LOCALPATH -t $CODECOV_TOKEN || echo "codecov upload failed" ================================================ FILE: build_tools/azure/windows.yml ================================================ parameters: name: '' vmImage: '' matrix: [] dependsOn: [] condition: ne(variables['Build.Reason'], 'Schedule') jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} condition: ${{ parameters.condition }} pool: vmImage: ${{ parameters.vmImage }} variables: VIRTUALENV: 'testvenv' JUNITXML: 'test-data.xml' SKLEARN_SKIP_NETWORK_TESTS: '1' PYTEST_VERSION: '5.2.1' PYTEST_XDIST: 'true' PYTEST_XDIST_VERSION: 'latest' TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: - bash: echo "##vso[task.prependpath]$CONDA/Scripts" displayName: Add conda to PATH for 64 bit Python condition: eq(variables['PYTHON_ARCH'], '64') - task: UsePythonVersion@0 inputs: versionSpec: '$(PYTHON_VERSION)' addToPath: true architecture: 'x86' displayName: Use 32 bit System Python condition: eq(variables['PYTHON_ARCH'], '32') - bash: ./build_tools/azure/install_win.sh displayName: 'Install' - bash: ./build_tools/azure/test_script.sh displayName: 'Test Library' - bash: ./build_tools/azure/upload_codecov.sh condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) displayName: 'Upload To Codecov' env: CODECOV_TOKEN: $(CODECOV_TOKEN) - task: PublishTestResults@2 inputs: testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test 
Results' condition: succeededOrFailed() ================================================ FILE: build_tools/circle/build_doc.sh ================================================ #!/usr/bin/env bash set -x set -e # Decide what kind of documentation build to run, and run it. # # If the last commit message has a "[doc skip]" marker, do not build # the doc. On the contrary if a "[doc build]" marker is found, build the doc # instead of relying on the subsequent rules. # # We always build the documentation for jobs that are not related to a specific # PR (e.g. a merge to main or a maintenance branch). # # If this is a PR, do a full build if there are some files in this PR that are # under the "doc/" or "examples/" folders, otherwise perform a quick build. # # If the inspection of the current commit fails for any reason, the default # behavior is to quick build the documentation. get_build_type() { if [ -z "$CIRCLE_SHA1" ] then echo SKIP: undefined CIRCLE_SHA1 return fi commit_msg=$(git log --format=%B -n 1 $CIRCLE_SHA1) if [ -z "$commit_msg" ] then echo QUICK BUILD: failed to inspect commit $CIRCLE_SHA1 return fi if [[ "$commit_msg" =~ \[doc\ skip\] ]] then echo SKIP: [doc skip] marker found return fi if [[ "$commit_msg" =~ \[doc\ quick\] ]] then echo QUICK: [doc quick] marker found return fi if [[ "$commit_msg" =~ \[doc\ build\] ]] then echo BUILD: [doc build] marker found return fi if [ -z "$CI_PULL_REQUEST" ] then echo BUILD: not a pull request return fi git_range="origin/main...$CIRCLE_SHA1" git fetch origin main >&2 || (echo QUICK BUILD: failed to get changed filenames for $git_range; return) filenames=$(git diff --name-only $git_range) if [ -z "$filenames" ] then echo QUICK BUILD: no changed filenames for $git_range return fi changed_examples=$(echo "$filenames" | grep -E "^examples/(.*/)*plot_") # The following is used to extract the list of filenames of example python # files that sphinx-gallery needs to run to generate png files used as # figures or images in the .rst files from the documentation. # If the contributor changes a .rst file in a PR we need to run all # the examples mentioned in that file to get sphinx build the # documentation without generating spurious warnings related to missing # png files. 
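# For instance (illustrative filename): an rst directive such as
#   .. figure:: ../auto_examples/images/sphx_glr_plot_iris_001.png
# is matched below; stripping the "sphx_glr_" prefix and replacing the
# trailing "_001.png" (or "_thumb.png") with ".py" recovers the example
# script "plot_iris.py" that sphinx-gallery has to re-run.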
if [[ -n "$filenames" ]] then # get rst files rst_files="$(echo "$filenames" | grep -E "rst$")" # get lines with figure or images img_fig_lines="$(echo "$rst_files" | xargs grep -shE "(figure|image)::")" # get only auto_examples auto_example_files="$(echo "$img_fig_lines" | grep auto_examples | awk -F "/" '{print $NF}')" # remove "sphx_glr_" from path and accept replace _(\d\d\d|thumb).png with .py scripts_names="$(echo "$auto_example_files" | sed 's/sphx_glr_//' | sed -E 's/_([[:digit:]][[:digit:]][[:digit:]]|thumb).png/.py/')" # get unique values examples_in_rst="$(echo "$scripts_names" | uniq )" fi # executed only if there are examples in the modified rst files if [[ -n "$examples_in_rst" ]] then if [[ -n "$changed_examples" ]] then changed_examples="$changed_examples|$examples_in_rst" else changed_examples="$examples_in_rst" fi fi if [[ -n "$changed_examples" ]] then echo BUILD: detected examples/ filename modified in $git_range: $changed_examples pattern=$(echo "$changed_examples" | paste -sd '|') # pattern for examples to run is the last line of output echo "$pattern" return fi echo QUICK BUILD: no examples/ filename modified in $git_range: echo "$filenames" } build_type=$(get_build_type) if [[ "$build_type" =~ ^SKIP ]] then exit 0 fi if [[ "$CIRCLE_BRANCH" =~ ^main$|^[0-9]+\.[0-9]+\.X$ && -z "$CI_PULL_REQUEST" ]] then # ZIP linked into HTML make_args=dist elif [[ "$build_type" =~ ^QUICK ]] then make_args=html-noplot elif [[ "$build_type" =~ ^'BUILD: detected examples' ]] then # pattern for examples to run is the last line of output pattern=$(echo "$build_type" | tail -n 1) make_args="html EXAMPLES_PATTERN=$pattern" else make_args=html fi make_args="SPHINXOPTS=-T $make_args" # show full traceback on exception # Installing required system packages to support the rendering of math # notation in the HTML documentation and to optimize the image files sudo -E apt-get -yq update --allow-releaseinfo-change sudo -E apt-get -yq --no-install-suggests --no-install-recommends \ install dvipng gsfonts ccache zip optipng # deactivate circleci virtualenv and setup a miniconda env instead if [[ `type -t deactivate` ]]; then deactivate fi MINICONDA_PATH=$HOME/miniconda # Install dependencies with miniconda wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \ -O miniconda.sh chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH export PATH="/usr/lib/ccache:$MINICONDA_PATH/bin:$PATH" ccache -M 512M export CCACHE_COMPRESS=1 # Old packages coming from the 'free' conda channel have been removed but we # are using them for our min-dependencies doc generation. See # https://www.anaconda.com/why-we-removed-the-free-channel-in-conda-4-7/ for # more details. if [[ "$CIRCLE_JOB" == "doc-min-dependencies" ]]; then conda config --set restore_free_channel true fi # imports get_dep source build_tools/shared.sh # packaging won't be needed once setuptools starts shipping packaging>=17.0 mamba create -n $CONDA_ENV_NAME --yes --quiet \ python="${PYTHON_VERSION:-*}" \ "$(get_dep numpy $NUMPY_VERSION)" \ "$(get_dep scipy $SCIPY_VERSION)" \ "$(get_dep cython $CYTHON_VERSION)" \ "$(get_dep matplotlib $MATPLOTLIB_VERSION)" \ "$(get_dep sphinx $SPHINX_VERSION)" \ "$(get_dep pandas $PANDAS_VERSION)" \ joblib memory_profiler packaging seaborn pillow pytest coverage source activate testenv # Pin PyWavelet to 1.1.1 that is the latest version that support our minumum # NumPy version required. 
# If PyWavelets 1.2+ is installed, it would require # NumPy 1.17+, which triggers a bug with Pandas 0.25: # https://github.com/numpy/numpy/issues/18355#issuecomment-774610226 pip install PyWavelets==1.1.1 pip install "$(get_dep scikit-image $SCIKIT_IMAGE_VERSION)" pip install "$(get_dep sphinx-gallery $SPHINX_GALLERY_VERSION)" pip install "$(get_dep numpydoc $NUMPYDOC_VERSION)" pip install "$(get_dep sphinx-prompt $SPHINX_PROMPT_VERSION)" pip install "$(get_dep sphinxext-opengraph $SPHINXEXT_OPENGRAPH_VERSION)" # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI # workers with 2 cores when building the compiled extensions of scikit-learn. export SKLEARN_BUILD_PARALLEL=3 python setup.py develop export OMP_NUM_THREADS=1 if [[ "$CIRCLE_BRANCH" =~ ^main$ && -z "$CI_PULL_REQUEST" ]] then # List available documentation versions if on main python build_tools/circle/list_versions.py > doc/versions.rst fi # The pipefail is requested to propagate exit code set -o pipefail && cd doc && make $make_args 2>&1 | tee ~/log.txt # Insert the version warning for deployment find _build/html/stable -name "*.html" | xargs sed -i '/<\/body>/ i \ \ <script src="https://scikit-learn.org/versionwarning.js"></script>' cd - set +o pipefail affected_doc_paths() { files=$(git diff --name-only origin/main...$CIRCLE_SHA1) echo "$files" | grep ^doc/.*\.rst | sed 's/^doc\/\(.*\)\.rst$/\1.html/' echo "$files" | grep ^examples/.*.py | sed 's/^\(.*\)\.py$/auto_\1.html/' sklearn_files=$(echo "$files" | grep '^sklearn/') if [ -n "$sklearn_files" ] then grep -hlR -f<(echo "$sklearn_files" | sed 's/^/scikit-learn\/blob\/[a-z0-9]*\//') doc/_build/html/stable/modules/generated | cut -d/ -f5- fi } affected_doc_warnings() { files=$(git diff --name-only origin/main...$CIRCLE_SHA1) # Look for sphinx warnings only in files affected by the PR if [ -n "$files" ] then for af in ${files[@]} do warn+=`grep WARNING ~/log.txt | grep $af` done fi echo "$warn" } if [ -n "$CI_PULL_REQUEST" ] then echo "The following documentation warnings may have been generated by PR #$CI_PULL_REQUEST:" warnings=$(affected_doc_warnings) if [ -z "$warnings" ] then warnings="/home/circleci/project/ no warnings" fi echo "$warnings" echo "The following documentation files may have been changed by PR #$CI_PULL_REQUEST:" affected=$(affected_doc_paths) echo "$affected" ( echo '<html><body><ul>'
echo "$affected" | sed 's|.*|<li><a href="&">&</a> [<a href="https://scikit-learn.org/dev/&">dev</a>, <a href="https://scikit-learn.org/stable/&">stable</a>]</li>|' echo '</ul><p>General: <a href="index.html">Home</a> | <a href="modules/classes.html">API Reference</a> | <a href="auto_examples/index.html">Examples</a></p>' echo '<strong>Sphinx Warnings in affected files</strong><ul>' echo "$warnings" | sed 's/\/home\/circleci\/project\//<li>/g' echo '</ul></body></html>' ) > 'doc/_build/html/stable/_changed.html' if [ "$warnings" != "/home/circleci/project/ no warnings" ] then echo "Sphinx generated warnings when building the documentation related to files modified in this PR." echo "Please check doc/_build/html/stable/_changed.html" exit 1 fi fi ================================================ FILE: build_tools/circle/build_test_arm.sh ================================================ #!/bin/bash set -e set -x UNAMESTR=`uname` N_CORES=`nproc --all` setup_ccache() { echo "Setting up ccache" mkdir /tmp/ccache/ which ccache for name in gcc g++ cc c++ x86_64-linux-gnu-gcc x86_64-linux-gnu-c++; do ln -s $(which ccache) "/tmp/ccache/${name}" done export PATH="/tmp/ccache:${PATH}" # Unset ccache limits ccache -F 0 ccache -M 0 } # imports get_dep source build_tools/shared.sh sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test sudo apt-get update # Setup conda environment MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-aarch64.sh" # Install Mambaforge wget $MINICONDA_URL -O mambaforge.sh MINICONDA_PATH=$HOME/miniconda chmod +x mambaforge.sh && ./mambaforge.sh -b -p $MINICONDA_PATH export PATH=$MINICONDA_PATH/bin:$PATH mamba init --all --verbose mamba update --yes conda # Create environment and install dependencies mamba create -n testenv --yes $(get_dep python $PYTHON_VERSION) source activate testenv # Use the latest by default mamba install --verbose -y ccache \ pip \ $(get_dep numpy $NUMPY_VERSION) \ $(get_dep scipy $SCIPY_VERSION) \ $(get_dep cython $CYTHON_VERSION) \ $(get_dep joblib $JOBLIB_VERSION) \ $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ $(get_dep pytest $PYTEST_VERSION) \ $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) setup_ccache if [[ "$COVERAGE" == "true" ]]; then mamba install --verbose -y codecov pytest-cov fi if [[ "$TEST_DOCSTRINGS" == "true" ]]; then # numpydoc requires sphinx mamba install --verbose -y sphinx mamba install --verbose -y numpydoc fi python --version # Set parallelism to $N_CORES + 1 to overlap IO bound tasks with CPU bound tasks on CI # workers with $N_CORES cores when building the compiled extensions of scikit-learn. export SKLEARN_BUILD_PARALLEL=$(($N_CORES + 1)) # Disable the build isolation and build in the tree so that the same folder can be # cached between CI runs. # TODO: remove the '--use-feature' flag when made obsolete in pip 21.3. pip install --verbose --no-build-isolation --use-feature=in-tree-build . # Report cache usage ccache -s --verbose mamba list # Change directory so that module resolution uses the installed package # rather than the scikit-learn source directory. cd /tmp python -c "import sklearn; sklearn.show_versions()" python -m threadpoolctl --import sklearn # Test using as many workers as available cores pytest --pyargs -n $N_CORES sklearn ================================================ FILE: build_tools/circle/build_test_pypy.sh ================================================ #!/usr/bin/env bash set -x set -e # System build tools apt-get -yq update apt-get -yq install wget bzip2 build-essential ccache # Install pypy and all the scikit-learn dependencies from conda-forge. In # particular, we want to install pypy compatible binary packages for numpy and # scipy as it would be too costly to build those from source.
conda install -y mamba mamba create -n pypy -y \ pypy numpy scipy cython \ joblib threadpoolctl pillow pytest \ sphinx numpydoc docutils eval "$(conda shell.bash hook)" conda activate pypy # Check that we are running PyPy instead of CPython in this environment. python --version which python python -c "import platform; assert platform.python_implementation() == 'PyPy'" # Build and install scikit-learn in dev mode ccache -M 512M export CCACHE_COMPRESS=1 export PATH=/usr/lib/ccache:$PATH export LOKY_MAX_CPU_COUNT="2" export OMP_NUM_THREADS="1" # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI # workers with 2 cores when building the compiled extensions of scikit-learn. export SKLEARN_BUILD_PARALLEL=3 pip install --no-build-isolation -e . python -m pytest sklearn ================================================ FILE: build_tools/circle/checkout_merge_commit.sh ================================================ #!/bin/bash # Add `main` branch to the update list. # Otherwise CircleCI will give us a cached one. FETCH_REFS="+main:main" # Update PR refs for testing. if [[ -n "${CIRCLE_PR_NUMBER}" ]] then FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/head:pr/${CIRCLE_PR_NUMBER}/head" FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" fi # Retrieve the refs. git fetch -u origin ${FETCH_REFS} # Checkout the PR merge ref. if [[ -n "${CIRCLE_PR_NUMBER}" ]] then git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" || ( echo Could not fetch merge commit. >&2 echo There may be conflicts in merging PR \#${CIRCLE_PR_NUMBER} with main. >&2; exit 1) fi # Check for merge conflicts. if [[ -n "${CIRCLE_PR_NUMBER}" ]] then git branch --merged | grep main > /dev/null git branch --merged | grep "pr/${CIRCLE_PR_NUMBER}/head" > /dev/null fi ================================================ FILE: build_tools/circle/linting.sh ================================================ #!/bin/bash # This script is used in CircleCI to check that PRs do not add obvious # flake8 violations. It relies on two things: # - find common ancestor between branch and # scikit-learn/scikit-learn remote # - run flake8 --diff on the diff between the branch and the common # ancestor # # Additional features: # - the line numbers in Travis match the local branch on the PR # author machine. # - ./build_tools/circle/flake8_diff.sh can be run locally for quick # turn-around set -e # pipefail is necessary to propagate exit codes set -o pipefail PROJECT=scikit-learn/scikit-learn PROJECT_URL=https://github.com/$PROJECT.git # Find the remote with the project name (upstream in most cases) REMOTE=$(git remote -v | grep $PROJECT | cut -f1 | head -1 || echo '') # Add a temporary remote if needed. For example this is necessary when # Travis is configured to run in a fork. In this case 'origin' is the # fork and not the reference repo we want to diff against. if [[ -z "$REMOTE" ]]; then TMP_REMOTE=tmp_reference_upstream REMOTE=$TMP_REMOTE git remote add $REMOTE $PROJECT_URL fi echo "Remotes:" echo '--------------------------------------------------------------------------------' git remote --verbose # Travis does the git clone with a limited depth (50 at the time of # writing). 
This may not be enough to find the common ancestor with # $REMOTE/main so we unshallow the git checkout if [[ -a .git/shallow ]]; then echo -e '\nTrying to unshallow the repo:' echo '--------------------------------------------------------------------------------' git fetch --unshallow fi if [[ "$TRAVIS" == "true" ]]; then if [[ "$TRAVIS_PULL_REQUEST" == "false" ]] then # In main repo, using TRAVIS_COMMIT_RANGE to test the commits # that were pushed into a branch if [[ "$PROJECT" == "$TRAVIS_REPO_SLUG" ]]; then if [[ -z "$TRAVIS_COMMIT_RANGE" ]]; then echo "New branch, no commit range from Travis so passing this test by convention" exit 0 fi COMMIT_RANGE=$TRAVIS_COMMIT_RANGE fi else # We want to fetch the code as it is in the PR branch and not # the result of the merge into main. This way line numbers # reported by Travis will match with the local code. LOCAL_BRANCH_REF=travis_pr_$TRAVIS_PULL_REQUEST # In Travis the PR target is always origin git fetch origin pull/$TRAVIS_PULL_REQUEST/head:refs/$LOCAL_BRANCH_REF fi fi # If not using the commit range from Travis we need to find the common # ancestor between $LOCAL_BRANCH_REF and $REMOTE/main if [[ -z "$COMMIT_RANGE" ]]; then if [[ -z "$LOCAL_BRANCH_REF" ]]; then LOCAL_BRANCH_REF=$(git rev-parse --abbrev-ref HEAD) fi echo -e "\nLast 2 commits in $LOCAL_BRANCH_REF:" echo '--------------------------------------------------------------------------------' git --no-pager log -2 $LOCAL_BRANCH_REF REMOTE_MAIN_REF="$REMOTE/main" # Make sure that $REMOTE_MAIN_REF is a valid reference echo -e "\nFetching $REMOTE_MAIN_REF" echo '--------------------------------------------------------------------------------' git fetch $REMOTE main:refs/remotes/$REMOTE_MAIN_REF LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short $LOCAL_BRANCH_REF) REMOTE_MAIN_SHORT_HASH=$(git rev-parse --short $REMOTE_MAIN_REF) COMMIT=$(git merge-base $LOCAL_BRANCH_REF $REMOTE_MAIN_REF) || \ echo "No common ancestor found for $(git show $LOCAL_BRANCH_REF -q) and $(git show $REMOTE_MAIN_REF -q)" if [ -z "$COMMIT" ]; then exit 1 fi COMMIT_SHORT_HASH=$(git rev-parse --short $COMMIT) echo -e "\nCommon ancestor between $LOCAL_BRANCH_REF ($LOCAL_BRANCH_SHORT_HASH)"\ "and $REMOTE_MAIN_REF ($REMOTE_MAIN_SHORT_HASH) is $COMMIT_SHORT_HASH:" echo '--------------------------------------------------------------------------------' git --no-pager show --no-patch $COMMIT_SHORT_HASH COMMIT_RANGE="$COMMIT_SHORT_HASH..$LOCAL_BRANCH_SHORT_HASH" if [[ -n "$TMP_REMOTE" ]]; then git remote remove $TMP_REMOTE fi else echo "Got the commit range from Travis: $COMMIT_RANGE" fi echo -e '\nRunning flake8 on the diff in the range' "$COMMIT_RANGE" \ "($(git rev-list $COMMIT_RANGE | wc -l) commit(s)):" echo '--------------------------------------------------------------------------------' # We ignore files from sklearn/externals. Unfortunately there is no # way to do it with flake8 directly (the --exclude does not seem to # work with --diff). We could use the exclude magic in the git pathspec # ':!sklearn/externals' but it is only available on git 1.9 and Travis # uses git 1.8. 
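In isolation, the common-ancestor logic above boils down to a couple of git commands; a minimal sketch with hypothetical ref names (not taken from this script):

# Hypothetical refs, for illustration only.
LOCAL_BRANCH_REF=my-feature-branch
REMOTE_MAIN_REF=upstream/main
# merge-base returns the most recent commit reachable from both refs.
COMMIT=$(git merge-base $LOCAL_BRANCH_REF $REMOTE_MAIN_REF)
COMMIT_RANGE="$(git rev-parse --short $COMMIT)..$(git rev-parse --short $LOCAL_BRANCH_REF)"
# flake8 then only sees the lines added or changed since the fork point.
git diff --unified=0 $COMMIT_RANGE | flake8 --diff --show-source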
# The following command needs to exit with 0, hence the echo fallback in case # there is no match MODIFIED_FILES="$(git diff --name-only $COMMIT_RANGE | grep -v 'sklearn/externals' | \ grep -v 'doc/sphinxext' || echo "no_match")" check_files() { files="$1" shift options="$*" if [ -n "$files" ]; then # Conservative approach: diff without context (--unified=0) so that code # that was not changed does not create failures git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --show-source $options fi } if [[ "$MODIFIED_FILES" == "no_match" ]]; then echo "No file outside sklearn/externals and doc/sphinxext has been modified" else check_files "$MODIFIED_FILES" # check code for unused imports flake8 --exclude=sklearn/externals/ --select=F401 sklearn/ examples/ fi echo -e "No problem detected by flake8\n" # For docstrings and warnings of deprecated attributes to be rendered # properly, the property decorator must come before the deprecated decorator # (else they are treated as functions) # do not error when grep -B1 "@property" finds nothing set +e bad_deprecation_property_order=`git grep -A 10 "@property" -- "*.py" | awk '/@property/,/def /' | grep -B1 "@deprecated"` if [ ! -z "$bad_deprecation_property_order" ] then echo "property decorator should come before deprecated decorator" echo "found the following occurrences:" echo $bad_deprecation_property_order exit 1 fi # Check for default doctest directives ELLIPSIS and NORMALIZE_WHITESPACE doctest_directive="$(git grep -nw -E "# doctest\: \+(ELLIPSIS|NORMALIZE_WHITESPACE)")" if [ ! -z "$doctest_directive" ] then echo "ELLIPSIS and NORMALIZE_WHITESPACE doctest directives are enabled by default, but were found in:" echo "$doctest_directive" exit 1 fi
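The property/deprecated ordering check above can be illustrated with a minimal sketch (the grep/awk pipeline is copied from the script; the file content in the comments is hypothetical):

# Given a file containing the discouraged order
#     @property
#     @deprecated("`coef` was renamed to `coef_`")   # hypothetical attribute
#     def coef(self): ...
# the pipeline prints the offending pair, because "@deprecated" shows up
# between "@property" and the "def" line:
git grep -A 10 "@property" -- "*.py" | awk '/@property/,/def /' | grep -B1 "@deprecated"
# The accepted order lists @deprecated first, so that it decorates the
# property object itself (sklearn.utils.deprecation.deprecated knows how to
# wrap properties) instead of wrapping a plain getter function.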
joblib_import="$(git grep -l -A 10 -E "joblib import.+delayed" -- "*.py" ":!sklearn/utils/_joblib.py" ":!sklearn/utils/fixes.py")" if [ ! -z "$joblib_import" ]; then echo "Use from sklearn.utils.fixes import delayed instead of joblib delayed. The following files contain imports to joblib.delayed:" echo "$joblib_import" exit 1 fi ================================================ FILE: build_tools/circle/list_versions.py ================================================ #!/usr/bin/env python3 # List all available versions of the documentation import json import re import sys from distutils.version import LooseVersion from urllib.request import urlopen def json_urlread(url): try: return json.loads(urlopen(url).read().decode("utf8")) except Exception: print("Error reading", url, file=sys.stderr) raise def human_readable_data_quantity(quantity, multiple=1024): # https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size if quantity == 0: quantity = +0 SUFFIXES = ["B"] + [i + {1000: "B", 1024: "iB"}[multiple] for i in "KMGTPEZY"] for suffix in SUFFIXES: if quantity < multiple or suffix == SUFFIXES[-1]: if suffix == SUFFIXES[0]: return "%d %s" % (quantity, suffix) else: return "%.1f %s" % (quantity, suffix) else: quantity /= multiple def get_file_extension(version): if "dev" in version: # The 'dev' branch should be explicitly handled return "zip" current_version = LooseVersion(version) min_zip_version = LooseVersion("0.24") return "zip" if current_version >= min_zip_version else "pdf" def get_file_size(version): api_url = ROOT_URL + "%s/_downloads" % version for path_details in json_urlread(api_url): file_extension = get_file_extension(version) file_path = f"scikit-learn-docs.{file_extension}" if path_details["name"] == file_path: return human_readable_data_quantity(path_details["size"], 1000) print(":orphan:") print() heading = "Available documentation for Scikit-learn" print(heading) print("=" * len(heading)) print() print("Web-based documentation is available for versions listed below:") print() ROOT_URL = ( "https://api.github.com/repos/scikit-learn/scikit-learn.github.io/contents/" # noqa ) RAW_FMT = "https://raw.githubusercontent.com/scikit-learn/scikit-learn.github.io/master/%s/index.html" # noqa VERSION_RE = re.compile(r"scikit-learn ([\w\.\-]+) documentation") NAMED_DIRS = ["dev", "stable"] # Gather data for each version directory, including symlinks dirs = {} symlinks = {} root_listing = json_urlread(ROOT_URL) for path_details in root_listing: name = path_details["name"] if not (name[:1].isdigit() or name in NAMED_DIRS): continue if path_details["type"] == "dir": html = urlopen(RAW_FMT % name).read().decode("utf8") version_num = VERSION_RE.search(html).group(1) file_size = get_file_size(name) dirs[name] = (version_num, file_size) if path_details["type"] == "symlink": symlinks[name] = json_urlread(path_details["_links"]["self"])["target"] # Symlinks should have same data as target for src, dst in symlinks.items(): if dst in dirs: dirs[src] = dirs[dst] # Output in order: dev, stable, then other versions in decreasing order seen = set() for name in NAMED_DIRS + sorted( (k for k in dirs if k[:1].isdigit()), key=LooseVersion, reverse=True ): version_num, file_size = dirs[name] if version_num in seen: # symlink came first continue else: seen.add(version_num) name_display = "" if name[:1].isdigit() else " (%s)" % name path = "https://scikit-learn.org/%s/" % name out = "* `Scikit-learn %s%s documentation <%s>`_" % ( version_num, name_display, path, ) if file_size is not None: file_extension = get_file_extension(version_num) out += ( f" (`{file_extension.upper()} {file_size} <{path}/" f"_downloads/scikit-learn-docs.{file_extension}>`_)" ) print(out)
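As noted in build_doc.sh above, this script is run on main to regenerate doc/versions.rst, with each documentation directory becoming one RST bullet. A sketch of the invocation and of a typical emitted line (version number and size are hypothetical):

# Regenerate the list of documentation versions (run from the repo root):
python build_tools/circle/list_versions.py > doc/versions.rst
# Each version renders roughly as:
# * `Scikit-learn 0.24.2 documentation <https://scikit-learn.org/0.24.2/>`_ (`ZIP 52.3 MB <https://scikit-learn.org/0.24.2/_downloads/scikit-learn-docs.zip>`_)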
================================================ FILE: build_tools/circle/push_doc.sh ================================================ #!/bin/bash # This script is meant to be called in the "deploy" step defined in # circle.yml. See https://circleci.com/docs/ for more details. # The behavior of the script is controlled by environment variables defined # in the circle.yml in the top level folder of the project. set -ex if [ -z $CIRCLE_PROJECT_USERNAME ]; then USERNAME="sklearn-ci"; else USERNAME=$CIRCLE_PROJECT_USERNAME; fi DOC_REPO="scikit-learn.github.io" GENERATED_DOC_DIR=$1 if [[ -z "$GENERATED_DOC_DIR" ]]; then echo "Need to pass directory of the generated doc as argument" echo "Usage: $0 <generated_doc_dir>" exit 1 fi # Absolute path needed because we use cd further down in this script GENERATED_DOC_DIR=$(readlink -f $GENERATED_DOC_DIR) if [ "$CIRCLE_BRANCH" = "main" ] then dir=dev else # Strip off .X dir="${CIRCLE_BRANCH::-2}" fi MSG="Pushing the docs to $dir/ for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1" cd $HOME if [ ! -d $DOC_REPO ]; then git clone --depth 1 --no-checkout "git@github.com:scikit-learn/"$DOC_REPO".git"; fi cd $DOC_REPO # check if it's a new branch echo $dir > .git/info/sparse-checkout if ! git show HEAD:$dir >/dev/null then # directory does not exist. Need to make it so sparse checkout works mkdir $dir touch $dir/index.html git add $dir fi git checkout main git reset --hard origin/main if [ -d $dir ] then git rm -rf $dir/ && rm -rf $dir/ fi cp -R $GENERATED_DOC_DIR $dir git config user.email "olivier.grisel+sklearn-ci@gmail.com" git config user.name $USERNAME git config push.default matching git add -f $dir/ git commit -m "$MSG" $dir git push echo $MSG ================================================ FILE: build_tools/codespell_ignore_words.txt ================================================ aggresive aline ba basf boun bre cach complies coo copys deine didi feld fo fpr fro fwe gool hart hist ines inout ist jaques linke lod mape mor nd nmae ocur pullrequest ro soler suh suprised te technic teh thi usal vie wan winn yau ================================================ FILE: build_tools/generate_authors_table.py ================================================ """ This script generates an html table of contributors, with names and avatars. The list is generated from scikit-learn's teams on GitHub, plus a small number of hard-coded contributors. The table should be updated for each new inclusion in the teams. Generating the table requires admin rights. """ import sys import requests import getpass import time from pathlib import Path from os import path print("user:", file=sys.stderr) user = input() token = getpass.getpass("access token:\n") auth = (user, token) LOGO_URL = "https://avatars2.githubusercontent.com/u/365630?v=4" REPO_FOLDER = Path(path.abspath(__file__)).parent.parent def get(url): for sleep_time in [10, 30, 0]: reply = requests.get(url, auth=auth) api_limit = ( "message" in reply.json() and "API rate limit exceeded" in reply.json()["message"] ) if not api_limit: break print("API rate limit exceeded, waiting..") time.sleep(sleep_time) reply.raise_for_status() return reply def get_contributors(): """Get the list of contributor profiles.
Requires admin rights.""" # get core devs and triage team core_devs = [] triage_team = [] comm_team = [] core_devs_id = 11523 triage_team_id = 3593183 comm_team_id = 5368696 for team_id, lst in zip( (core_devs_id, triage_team_id, comm_team_id), (core_devs, triage_team, comm_team), ): for page in [1, 2]: # 30 per page reply = get(f"https://api.github.com/teams/{team_id}/members?page={page}") lst.extend(reply.json()) # get members of scikit-learn on GitHub members = [] for page in [1, 2]: # 30 per page reply = get( "https://api.github.com/orgs/scikit-learn/members?page=%d" % (page,) ) members.extend(reply.json()) # keep only the logins core_devs = set(c["login"] for c in core_devs) triage_team = set(c["login"] for c in triage_team) comm_team = set(c["login"] for c in comm_team) members = set(c["login"] for c in members) # add missing contributors with GitHub accounts members |= {"dubourg", "mbrucher", "thouis", "jarrodmillman"} # add missing contributors without GitHub accounts members |= {"Angel Soler Gollonet"} # remove CI bots members -= {"sklearn-ci", "sklearn-lgtm", "sklearn-wheels"} triage_team -= core_devs # remove ogrisel from triage_team emeritus = members - core_devs - triage_team # get profiles from GitHub core_devs = [get_profile(login) for login in core_devs] emeritus = [get_profile(login) for login in emeritus] triage_team = [get_profile(login) for login in triage_team] comm_team = [get_profile(login) for login in comm_team] # sort by last name core_devs = sorted(core_devs, key=key) emeritus = sorted(emeritus, key=key) triage_team = sorted(triage_team, key=key) comm_team = sorted(comm_team, key=key) return core_devs, emeritus, triage_team, comm_team def get_profile(login): """Get the GitHub profile from login""" print("get profile for %s" % (login,)) try: profile = get("https://api.github.com/users/%s" % login).json() except requests.exceptions.HTTPError: return dict(name=login, avatar_url=LOGO_URL, html_url="") if profile["name"] is None: profile["name"] = profile["login"] # fix missing names missing_names = { "bthirion": "Bertrand Thirion", "dubourg": "Vincent Dubourg", "Duchesnay": "Edouard Duchesnay", "Lars": "Lars Buitinck", "MechCoder": "Manoj Kumar", } if profile["name"] in missing_names: profile["name"] = missing_names[profile["name"]] return profile def key(profile): """Get a sorting key based on the lower case last name, then first name""" components = profile["name"].lower().split(" ") return " ".join([components[-1]] + components[:-1]) def generate_table(contributors):
lines = [ ".. raw :: html\n", "    <!-- Generated by generate_authors_table.py -->", '    <div class="sk-authors-container">', "    ", ] for contributor in contributors: lines.append("    <div>") lines.append( "    <a href='%s'><img src='%s' class='avatar' /></a> <br />" % (contributor["html_url"], contributor["avatar_url"]) ) lines.append("    <p>%s</p>" % (contributor["name"],)) lines.append("    </div>") lines.append("    </div>") return "\n".join(lines) def generate_list(contributors): lines = [] for contributor in contributors: lines.append("- %s" % (contributor["name"],)) return "\n".join(lines) if __name__ == "__main__": core_devs, emeritus, triage_team, comm_team = get_contributors() with open(REPO_FOLDER / "doc" / "authors.rst", "w+") as rst_file: rst_file.write(generate_table(core_devs)) with open(REPO_FOLDER / "doc" / "authors_emeritus.rst", "w+") as rst_file: rst_file.write(generate_list(emeritus)) with open(REPO_FOLDER / "doc" / "triage_team.rst", "w+") as rst_file: rst_file.write(generate_table(triage_team)) with open(REPO_FOLDER / "doc" / "communication_team.rst", "w+") as rst_file: rst_file.write(generate_table(comm_team)) ================================================ FILE: build_tools/github/Windows ================================================ # Get the Python version of the base image from a build argument ARG PYTHON_VERSION FROM winamd64/python:$PYTHON_VERSION-windowsservercore ARG WHEEL_NAME ARG CONFTEST_NAME ARG CIBW_TEST_REQUIRES # Copy and install the Windows wheel COPY $WHEEL_NAME $WHEEL_NAME COPY $CONFTEST_NAME $CONFTEST_NAME RUN pip install $env:WHEEL_NAME # Install the testing dependencies RUN pip install $env:CIBW_TEST_REQUIRES.split(" ") ================================================ FILE: build_tools/github/build_minimal_windows_image.sh ================================================ #!/bin/bash set -e set -x PYTHON_VERSION=$1 BITNESS=$2 if [[ "$BITNESS" == "32" ]]; then # 32-bit architectures are not supported # by the official Docker images: Tests will just be run # on the host (instead of the minimal Docker container). exit 0 fi TEMP_FOLDER="$HOME/AppData/Local/Temp" WHEEL_PATH=$(ls -d $TEMP_FOLDER/*/repaired_wheel/*) WHEEL_NAME=$(basename $WHEEL_PATH) cp $WHEEL_PATH $WHEEL_NAME # Dot the Python version for identifying the base Docker image PYTHON_VERSION=$(echo ${PYTHON_VERSION:0:1}.${PYTHON_VERSION:1:2}) # Build a minimal Windows Docker image for testing the wheels docker build --build-arg PYTHON_VERSION=$PYTHON_VERSION \ --build-arg WHEEL_NAME=$WHEEL_NAME \ --build-arg CONFTEST_NAME=$CONFTEST_NAME \ --build-arg CIBW_TEST_REQUIRES="$CIBW_TEST_REQUIRES" \ -f build_tools/github/Windows \ -t scikit-learn/minimal-windows . ================================================ FILE: build_tools/github/build_source.sh ================================================ #!/bin/bash set -e set -x # Move up two levels to create the virtual # environment outside of the source folder cd ../../ python -m venv build_env source build_env/bin/activate python -m pip install numpy scipy cython python -m pip install twine cd scikit-learn/scikit-learn python setup.py sdist # Check whether the source distribution will render correctly twine check dist/*.tar.gz ================================================ FILE: build_tools/github/build_wheels.sh ================================================ #!/bin/bash set -e set -x # OpenMP is not present on macOS by default if [[ "$RUNNER_OS" == "macOS" ]]; then # Make sure to use a libomp version binary compatible with the oldest # supported version of the macos SDK as libomp will be vendored into the # scikit-learn wheels for macos. The list of binaries is in # https://packages.macports.org/libomp/. Currently, the oldest # supported macos version is: High Sierra / 10.13. When upgrading this, be # sure to update the MACOSX_DEPLOYMENT_TARGET environment variable in # wheels.yml accordingly. Note that Darwin_17 == High Sierra / 10.13.
wget https://packages.macports.org/libomp/libomp-11.0.1_0+universal.darwin_17.i386-x86_64.tbz2 -O libomp.tbz2 sudo tar -C / -xvjf libomp.tbz2 opt export CC=/usr/bin/clang export CXX=/usr/bin/clang++ export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp" export CFLAGS="$CFLAGS -I/opt/local/include/libomp" export CXXFLAGS="$CXXFLAGS -I/opt/local/include/libomp" export LDFLAGS="$LDFLAGS -Wl,-rpath,/opt/local/lib/libomp -L/opt/local/lib/libomp -lomp" fi # The versions of the built dependencies are specified # in the pyproject.toml file, while the tests are run # against the most recent version of the dependencies python -m pip install cibuildwheel python -m cibuildwheel --output-dir wheelhouse ================================================ FILE: build_tools/github/check_build_trigger.sh ================================================ #!/bin/bash set -e set -x COMMIT_MSG=$(git log --no-merges -1 --oneline) # The commit marker "[cd build]" will trigger the build when required if [[ "$GITHUB_EVENT_NAME" == schedule || "$COMMIT_MSG" =~ \[cd\ build\] ]]; then echo "::set-output name=build::true" fi ================================================ FILE: build_tools/github/check_wheels.py ================================================ """Checks that dist/* contains the number of wheels built from the .github/workflows/wheels.yml config.""" import yaml from pathlib import Path import sys gh_wheel_path = Path.cwd() / ".github" / "workflows" / "wheels.yml" with gh_wheel_path.open("r") as f: wheel_config = yaml.safe_load(f) build_matrix = wheel_config["jobs"]["build_wheels"]["strategy"]["matrix"] n_python_versions = len(build_matrix["python"]) # For each python version we have: 7 wheels # 1 osx wheel (x86_64) # 4 linux wheels (i686 + x86_64) * (manylinux1 + manylinux2010) # 2 windows wheels (win32 + win_amd64) n_wheels = 7 * n_python_versions # plus one more for the sdist n_wheels += 1 # aarch64 builds from travis travis_config_path = Path.cwd() / ".travis.yml" with travis_config_path.open("r") as f: travis_config = yaml.safe_load(f) jobs = travis_config["jobs"]["include"] travis_builds = [j for j in jobs if any("CIBW_BUILD" in env for env in j["env"])] n_wheels += len(travis_builds) dist_files = list(Path("dist").glob("**/*")) n_dist_files = len(dist_files) if n_dist_files != n_wheels: print( f"Expected {n_wheels} wheels in dist/* but " f"got {n_dist_files} artifacts instead." ) sys.exit(1) print(f"dist/* has the expected {n_wheels} wheels:") print("\n".join(file.name for file in dist_files)) ================================================ FILE: build_tools/github/repair_windows_wheels.sh ================================================ #!/bin/bash set -e set -x WHEEL=$1 DEST_DIR=$2 BITNESS=$3 # By default, the Windows wheels are not repaired.
# In this case, we need to vendor VCRUNTIME140.dll wheel unpack "$WHEEL" WHEEL_DIRNAME=$(ls -d scikit_learn-*) python build_tools/github/vendor.py "$WHEEL_DIRNAME" "$BITNESS" wheel pack "$WHEEL_DIRNAME" -d "$DEST_DIR" rm -rf "$WHEEL_DIRNAME" ================================================ FILE: build_tools/github/test_source.sh ================================================ #!/bin/bash set -e set -x cd ../../ python -m venv test_env source test_env/bin/activate python -m pip install scikit-learn/scikit-learn/dist/*.tar.gz python -m pip install pytest pandas # Run the tests on the installed source distribution mkdir tmp_for_test cp scikit-learn/scikit-learn/conftest.py tmp_for_test cd tmp_for_test pytest --pyargs sklearn ================================================ FILE: build_tools/github/test_wheels.sh ================================================ #!/bin/bash set -e set -x if [[ "$OSTYPE" != "linux-gnu" ]]; then # The Linux test environment is run in a Docker container and # it is not possible to copy the test configuration file (yet) cp $CONFTEST_PATH $CONFTEST_NAME fi # Test that there are no links to system libraries in the # threadpoolctl output section of the show_versions output: python -c "import sklearn; sklearn.show_versions()" pytest --pyargs sklearn ================================================ FILE: build_tools/github/test_windows_wheels.sh ================================================ #!/bin/bash set -e set -x PYTHON_VERSION=$1 BITNESS=$2 if [[ "$BITNESS" == "32" ]]; then # 32-bit architectures use the regular # test command (outside of the minimal Docker container) cp $CONFTEST_PATH $CONFTEST_NAME python -c "import sklearn; sklearn.show_versions()" pytest --pyargs sklearn else docker container run \ --rm scikit-learn/minimal-windows \ powershell -Command "python -c 'import sklearn; sklearn.show_versions()'" docker container run \ -e SKLEARN_SKIP_NETWORK_TESTS=1 \ -e OMP_NUM_THREADS=2 \ -e OPENBLAS_NUM_THREADS=2 \ --rm scikit-learn/minimal-windows \ powershell -Command "pytest --pyargs sklearn" fi ================================================ FILE: build_tools/github/upload_anaconda.sh ================================================ #!/bin/bash set -e set -x if [ "$GITHUB_EVENT_NAME" == "schedule" ]; then ANACONDA_ORG="scipy-wheels-nightly" ANACONDA_TOKEN="$SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN" else ANACONDA_ORG="scikit-learn-wheels-staging" ANACONDA_TOKEN="$SCIKIT_LEARN_STAGING_UPLOAD_TOKEN" fi # Install Python 3.8 because of a bug with Python 3.9 export PATH=$CONDA/bin:$PATH conda create -n upload -y python=3.8 source activate upload conda install -y anaconda-client # Force a replacement if the remote file already exists anaconda -t $ANACONDA_TOKEN upload --force -u $ANACONDA_ORG dist/artifact/* echo "Index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" ================================================ FILE: build_tools/github/vendor.py ================================================ """Embed vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll. Note that vcruntime140_1.dll is only required (and available) for 64-bit architectures. 
""" import os import os.path as op import shutil import sys import textwrap TARGET_FOLDER = op.join("sklearn", ".libs") DISTRIBUTOR_INIT = op.join("sklearn", "_distributor_init.py") VCOMP140_SRC_PATH = "C:\\Windows\\System32\\vcomp140.dll" VCRUNTIME140_SRC_PATH = "C:\\Windows\\System32\\vcruntime140.dll" VCRUNTIME140_1_SRC_PATH = "C:\\Windows\\System32\\vcruntime140_1.dll" def make_distributor_init_32_bits( distributor_init, vcomp140_dll_filename, vcruntime140_dll_filename ): """Create a _distributor_init.py file for 32-bit architectures. This file is imported first when importing the sklearn package so as to pre-load the vendored vcomp140.dll and vcruntime140.dll. """ with open(distributor_init, "wt") as f: f.write( textwrap.dedent( """ '''Helper to preload vcomp140.dll and vcruntime140.dll to prevent "not found" errors. Once vcomp140.dll and vcruntime140.dll are preloaded, the namespace is made available to any subsequent vcomp140.dll and vcruntime140.dll. This is created as part of the scripts that build the wheel. ''' import os import os.path as op from ctypes import WinDLL if os.name == "nt": # Load vcomp140.dll and vcruntime140.dll libs_path = op.join(op.dirname(__file__), ".libs") vcomp140_dll_filename = op.join(libs_path, "{0}") vcruntime140_dll_filename = op.join(libs_path, "{1}") WinDLL(op.abspath(vcomp140_dll_filename)) WinDLL(op.abspath(vcruntime140_dll_filename)) """.format( vcomp140_dll_filename, vcruntime140_dll_filename ) ) ) def make_distributor_init_64_bits( distributor_init, vcomp140_dll_filename, vcruntime140_dll_filename, vcruntime140_1_dll_filename, ): """Create a _distributor_init.py file for 64-bit architectures. This file is imported first when importing the sklearn package so as to pre-load the vendored vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll. """ with open(distributor_init, "wt") as f: f.write( textwrap.dedent( """ '''Helper to preload vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll to prevent "not found" errors. Once vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll are preloaded, the namespace is made available to any subsequent vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll. This is created as part of the scripts that build the wheel. 
''' import os import os.path as op from ctypes import WinDLL if os.name == "nt": # Load vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll libs_path = op.join(op.dirname(__file__), ".libs") vcomp140_dll_filename = op.join(libs_path, "{0}") vcruntime140_dll_filename = op.join(libs_path, "{1}") vcruntime140_1_dll_filename = op.join(libs_path, "{2}") WinDLL(op.abspath(vcomp140_dll_filename)) WinDLL(op.abspath(vcruntime140_dll_filename)) WinDLL(op.abspath(vcruntime140_1_dll_filename)) """.format( vcomp140_dll_filename, vcruntime140_dll_filename, vcruntime140_1_dll_filename, ) ) ) def main(wheel_dirname, bitness): """Embed vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll.""" if not op.exists(VCOMP140_SRC_PATH): raise ValueError(f"Could not find {VCOMP140_SRC_PATH}.") if not op.exists(VCRUNTIME140_SRC_PATH): raise ValueError(f"Could not find {VCRUNTIME140_SRC_PATH}.") if not op.exists(VCRUNTIME140_1_SRC_PATH) and bitness == "64": raise ValueError(f"Could not find {VCRUNTIME140_1_SRC_PATH}.") if not op.isdir(wheel_dirname): raise RuntimeError(f"Could not find {wheel_dirname} file.") vcomp140_dll_filename = op.basename(VCOMP140_SRC_PATH) vcruntime140_dll_filename = op.basename(VCRUNTIME140_SRC_PATH) vcruntime140_1_dll_filename = op.basename(VCRUNTIME140_1_SRC_PATH) target_folder = op.join(wheel_dirname, TARGET_FOLDER) distributor_init = op.join(wheel_dirname, DISTRIBUTOR_INIT) # Create the "sklearn/.libs" subfolder if not op.exists(target_folder): os.mkdir(target_folder) print(f"Copying {VCOMP140_SRC_PATH} to {target_folder}.") shutil.copy2(VCOMP140_SRC_PATH, target_folder) print(f"Copying {VCRUNTIME140_SRC_PATH} to {target_folder}.") shutil.copy2(VCRUNTIME140_SRC_PATH, target_folder) if bitness == "64": print(f"Copying {VCRUNTIME140_1_SRC_PATH} to {target_folder}.") shutil.copy2(VCRUNTIME140_1_SRC_PATH, target_folder) # Generate the _distributor_init file in the source tree print("Generating the '_distributor_init.py' file.") if bitness == "32": make_distributor_init_32_bits( distributor_init, vcomp140_dll_filename, vcruntime140_dll_filename ) else: make_distributor_init_64_bits( distributor_init, vcomp140_dll_filename, vcruntime140_dll_filename, vcruntime140_1_dll_filename, ) if __name__ == "__main__": _, wheel_file, bitness = sys.argv main(wheel_file, bitness) ================================================ FILE: build_tools/shared.sh ================================================ get_dep() { package="$1" version="$2" if [[ "$version" == "none" ]]; then # do not install with none echo elif [[ "${version%%[^0-9.]*}" ]]; then # version number is explicitly passed echo "$package==$version" elif [[ "$version" == "latest" ]]; then # use latest echo "$package" elif [[ "$version" == "min" ]]; then echo "$package==$(python sklearn/_min_dependencies.py $package)" fi } ================================================ FILE: build_tools/travis/after_success.sh ================================================ #!/bin/bash # This script is meant to be called by the "after_success" step # defined in ".travis.yml". In particular, we upload the wheels # of the ARM64 architecture for the continuous deployment jobs. 
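The get_dep helper defined in build_tools/shared.sh above maps a package/version pair to a requirement string for pip or conda. A few illustrative invocations (version numbers hypothetical; the "min" form needs a repository checkout, since it reads sklearn/_min_dependencies.py):

source build_tools/shared.sh
get_dep numpy 1.19.2   # prints "numpy==1.19.2" (explicit pin)
get_dep scipy latest   # prints "scipy" (install the latest release)
get_dep pandas none    # prints nothing (package is skipped)
get_dep cython min     # prints "cython==<minimum version from sklearn/_min_dependencies.py>"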
set -e # The wheels cannot be uploaded on PRs if [[ $BUILD_WHEEL == true && $TRAVIS_EVENT_TYPE != pull_request ]]; then # Nightly upload token and staging upload token are set in # Travis settings (originally generated at Anaconda cloud) if [[ $TRAVIS_EVENT_TYPE == cron ]]; then ANACONDA_ORG="scipy-wheels-nightly" ANACONDA_TOKEN="$SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN" else ANACONDA_ORG="scikit-learn-wheels-staging" ANACONDA_TOKEN="$SCIKIT_LEARN_STAGING_UPLOAD_TOKEN" fi MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh" wget $MINICONDA_URL -O miniconda.sh MINICONDA_PATH=$HOME/miniconda chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH # Install Python 3.8 because of a bug with Python 3.9 export PATH=$MINICONDA_PATH/bin:$PATH conda create -n upload -y python=3.8 source activate upload conda install -y anaconda-client # Force a replacement if the remote file already exists anaconda -t $ANACONDA_TOKEN upload --force -u $ANACONDA_ORG wheelhouse/*.whl echo "Index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" fi ================================================ FILE: build_tools/travis/install.sh ================================================ #!/bin/bash # This script is meant to be called by the "install" step # defined in the ".travis.yml" file. In particular, it is # important that we call the right installation script. if [[ $BUILD_WHEEL == true ]]; then source build_tools/travis/install_wheels.sh || travis_terminate 1 else source build_tools/travis/install_main.sh || travis_terminate 1 fi ================================================ FILE: build_tools/travis/install_main.sh ================================================ #!/bin/bash # Travis clones the "scikit-learn/scikit-learn" repository into # a local repository. We use a cached directory with three # scikit-learn repositories (one for each matrix entry of the # non continuous deployment jobs) from which we pull the local # Travis repository. This allows us to keep the build artifacts # for GCC + Cython and save time. set -e echo "CPU Arch: $TRAVIS_CPU_ARCH." # Import "get_dep" source build_tools/shared.sh echo "List files from cached directories." echo "pip:" ls $HOME/.cache/pip export CC=/usr/lib/ccache/gcc export CXX=/usr/lib/ccache/g++ # Useful for debugging how ccache is used # export CCACHE_LOGFILE=/tmp/ccache.log # 60MB are (more or less) used by .ccache, when # compiling from scratch at the time of writing ccache --max-size 100M --show-stats # Deactivate the default virtual environment # to setup a conda-based environment instead deactivate MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh" # Install Miniconda wget $MINICONDA_URL -O miniconda.sh MINICONDA_PATH=$HOME/miniconda chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH export PATH=$MINICONDA_PATH/bin:$PATH conda update --yes conda # Create environment and install dependencies conda create -n testenv --yes python=3.7 source activate testenv conda install -y scipy numpy pandas cython pip install joblib threadpoolctl pip install $(get_dep pytest $PYTEST_VERSION) pytest-xdist # Build scikit-learn in this script to collapse the # verbose build output in the Travis output when it # succeeds python --version python -c "import numpy; print(f'numpy {numpy.__version__}')" python -c "import scipy; print(f'scipy {scipy.__version__}')" pip install -e .
python setup.py develop ccache --show-stats # Useful for debugging how ccache is used # cat $CCACHE_LOGFILE ================================================ FILE: build_tools/travis/install_wheels.sh ================================================ #!/bin/bash python -m pip install cibuildwheel || travis_terminate $? python -m cibuildwheel --output-dir wheelhouse || travis_terminate $? ================================================ FILE: build_tools/travis/script.sh ================================================ #!/bin/bash # This script is meant to be called by the "script" step defined # in the ".travis.yml" file. While this step is skipped for the # continuous deployment jobs, we have to execute the scripts to # run the tests of the continuous integration jobs. if [[ $BUILD_WHEEL != true ]]; then # This trick will make Travis terminate the continuation of the pipeline bash build_tools/travis/test_script.sh || travis_terminate 1 bash build_tools/travis/test_docs.sh || travis_terminate 1 fi ================================================ FILE: build_tools/travis/test_docs.sh ================================================ #!/bin/bash set -e if [[ $TRAVIS_CPU_ARCH != arm64 ]]; then # Faster run of the documentation tests PYTEST="pytest -n $CPU_COUNT" make test-doc fi ================================================ FILE: build_tools/travis/test_script.sh ================================================ #!/bin/bash set -e python --version python -c "import numpy; print(f'numpy {numpy.__version__}')" python -c "import scipy; print(f'scipy {scipy.__version__}')" python -c "\ try: import pandas print(f'pandas {pandas.__version__}') except ImportError: pass " python -c "import joblib; print(f'{joblib.cpu_count()} CPUs')" python -c "import platform; print(f'{platform.machine()}')" TEST_CMD="pytest --showlocals --durations=20 --pyargs" # Run the tests on the installed version mkdir -p $TEST_DIR # Copy "setup.cfg" for the test settings cp setup.cfg $TEST_DIR cd $TEST_DIR if [[ $TRAVIS_CPU_ARCH == arm64 ]]; then # Faster run of the source code tests TEST_CMD="$TEST_CMD -n $CPU_COUNT" # Remove the option to test the docstrings sed -i -e 's/--doctest-modules//g' setup.cfg fi if [[ -n $CHECK_WARNINGS ]]; then TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning" fi $TEST_CMD sklearn ================================================ FILE: build_tools/travis/test_wheels.sh ================================================ #!/bin/bash pip install --upgrade pip || travis_terminate $? pip install pytest pytest-xdist || travis_terminate $? # Test that there are no links to system libraries in the threadpoolctl # section of the show_versions output. python -c "import sklearn; sklearn.show_versions()" || travis_terminate $? python -m pytest -n $CPU_COUNT --pyargs sklearn || travis_terminate $? ================================================ FILE: conftest.py ================================================ # Even if empty this file is useful so that when running from the root folder # ./sklearn is added to sys.path by pytest. See # https://docs.pytest.org/en/latest/explanation/pythonpath.html for more # details. For example, this allows building extensions in place and running pytest # doc/modules/clustering.rst, using sklearn from the local folder rather than # the one from site-packages. ================================================ FILE: doc/Makefile ================================================ # Makefile for Sphinx documentation # # You can set these variables from the command line.
SPHINXOPTS = -j auto SPHINXBUILD ?= sphinx-build PAPER = BUILDDIR = _build ifneq ($(EXAMPLES_PATTERN),) EXAMPLES_PATTERN_OPTS := -D sphinx_gallery_conf.filename_pattern="$(EXAMPLES_PATTERN)" endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -T -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)\ $(EXAMPLES_PATTERN_OPTS) . .PHONY: help clean html dirhtml ziphtml pickle json latex latexpdf changes linkcheck doctest optipng all: html-noplot help: @echo "Please use \`make <target>' where <target> is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " ziphtml to make a ZIP of the HTML" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " changes to make an overview of all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: -rm -rf $(BUILDDIR)/* -rm -rf auto_examples/ -rm -rf generated/* -rm -rf modules/generated/ html: # These two lines make the build a bit more lengthy, and the # embedding of images more robust rm -rf $(BUILDDIR)/html/_images #rm -rf _build/doctrees/ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html/stable" html-noplot: $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html/stable." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." ziphtml: @if [ ! -d "$(BUILDDIR)/html/stable/" ]; then \ make html; \ fi # Optimize the images to reduce the size of the ZIP optipng $(BUILDDIR)/html/stable/_images/*.png # Exclude the output directory to avoid infinite recursion cd $(BUILDDIR)/html/stable; \ zip -q -x _downloads \ -r _downloads/scikit-learn-docs.zip . @echo @echo "Build finished. The ZIP of the HTML is in $(BUILDDIR)/html/stable/_downloads." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." make -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt."
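Tying this Makefile back to build_doc.sh above: EXAMPLES_PATTERN is forwarded to sphinx-gallery's filename_pattern option, so only the matching example scripts are re-executed while the rest of the HTML is still rendered. A hypothetical invocation (the pattern below is invented for illustration):

# Rebuild the HTML docs, running only the examples whose filenames match:
make html EXAMPLES_PATTERN="plot_calibration.py|plot_iris.py"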
download-data: python -c "from sklearn.datasets._lfw import _check_fetch_lfw; _check_fetch_lfw()" # Optimize PNG files. Needs OptiPNG. Change the -P argument to the number of # cores you have available, so -P 64 if you have a real computer ;) optipng: find _build auto_examples */generated -name '*.png' -print0 \ | xargs -0 -n 1 -P 4 optipng -o10 dist: html ziphtml ================================================ FILE: doc/README.md ================================================ # Documentation for scikit-learn This directory contains the full manual and website as displayed at http://scikit-learn.org. See http://scikit-learn.org/dev/developers/contributing.html#documentation for detailed information about the documentation. ================================================ FILE: doc/about.rst ================================================ .. _about: About us ======== History ------- This project was started in 2007 as a Google Summer of Code project by David Cournapeau. Later that year, Matthieu Brucher started work on this project as part of his thesis. In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent Michel of INRIA took leadership of the project and made the first public release, February the 1st 2010. Since then, several releases have appeared following a ~ 3-month cycle, and a thriving international community has been leading the development. Governance ---------- The decision making process and governance structure of scikit-learn is laid out in the :ref:`governance document `. Authors ------- The following people are currently core contributors to scikit-learn's development and maintenance: .. include:: authors.rst Please do not email the authors directly to ask for assistance or report issues. Instead, please see `What's the best way to ask questions about scikit-learn `_ in the FAQ. .. seealso:: :ref:`How you can contribute to the project ` Triage Team ----------- The following people are active contributors who also help with :ref:`triaging issues `, PRs, and general maintenance: .. include:: triage_team.rst Communication Team ------------------ The following people help with :ref:`communication around scikit-learn `. .. include:: communication_team.rst Emeritus Core Developers ------------------------ The following people have been active contributors in the past, but are no longer active in the project: .. include:: authors_emeritus.rst .. _citing-scikit-learn: Citing scikit-learn ------------------- If you use scikit-learn in a scientific publication, we would appreciate citations to the following paper: `Scikit-learn: Machine Learning in Python `_, Pedregosa *et al.*, JMLR 12, pp. 2825-2830, 2011. Bibtex entry:: @article{scikit-learn, title={Scikit-learn: Machine Learning in {P}ython}, author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, journal={Journal of Machine Learning Research}, volume={12}, pages={2825--2830}, year={2011} } If you want to cite scikit-learn for its API or design, you may also want to consider the following paper: :arxiv:`API design for machine learning software: experiences from the scikit-learn project <1309.0238>`, Buitinck *et al.*, 2013. 
Bibtex entry:: @inproceedings{sklearn_api, author = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and Fabian Pedregosa and Andreas Mueller and Olivier Grisel and Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort and Jaques Grobler and Robert Layton and Jake VanderPlas and Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux}, title = {{API} design for machine learning software: experiences from the scikit-learn project}, booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning}, year = {2013}, pages = {108--122}, } Artwork ------- High quality PNG and SVG logos are available in the `doc/logos/ `_ source directory. .. image:: images/scikit-learn-logo-notext.png :align: center Funding ------- Scikit-Learn is a community driven project, however institutional and private grants help to assure its sustainability. The project would like to thank the following funders. ................................... .. raw:: html
The `Members `_ of the `Scikit-Learn Consortium at Inria Foundation `_ fund Olivier Grisel, Guillaume Lemaitre, Jérémie du Boisberranger and Chiara Marmo. .. raw:: html
.. |msn| image:: images/microsoft.png :width: 100pt :target: https://www.microsoft.com/ .. |bcg| image:: images/bcg.png :width: 100pt :target: https://www.bcg.com/beyond-consulting/bcg-gamma/default.aspx .. |axa| image:: images/axa.png :width: 50pt :target: https://www.axa.fr/ .. |bnp| image:: images/bnp.png :width: 150pt :target: https://www.bnpparibascardif.com/ .. |fujitsu| image:: images/fujitsu.png :width: 100pt :target: https://www.fujitsu.com/global/ .. |dataiku| image:: images/dataiku.png :width: 70pt :target: https://www.dataiku.com/ .. |aphp| image:: images/logo_APHP_text.png :width: 150pt :target: https://aphp.fr/ .. |inria| image:: images/inria-logo.jpg :width: 100pt :target: https://www.inria.fr .. raw:: html
.. table:: :class: sk-sponsor-table align-default +---------+----------+ | |bcg| | +---------+----------+ | | +---------+----------+ | |axa| | |bnp| | +---------+----------+ ||fujitsu|| |msn| | +---------+----------+ | | +---------+----------+ | |dataiku| | +---------+----------+ | |aphp| | +---------+----------+ | | +---------+----------+ | |inria| | +---------+----------+ .. raw:: html
........ .. raw:: html
`The University of Sydney `_ funds Joel Nothman since July 2017. .. raw:: html
.. image:: images/sydney-primary.jpeg :width: 100pt :align: center :target: https://sydney.edu.au/ .. raw:: html
.......... .. raw:: html
`Zalando SE `_ funds Adrin Jalali since August 2020. .. raw:: html
.. image:: images/zalando_logo.png :width: 100pt :align: center :target: https://corporate.zalando.com/en .. raw:: html
........... .. raw:: html
`Microsoft `_ funds Andreas Müller since 2020. .. raw:: html
.. image:: images/microsoft.png :width: 100pt :align: center :target: https://www.microsoft.com/ .. raw:: html
........... .. raw:: html
`Quansight Labs `_ funds Thomas J. Fan since 2021. .. raw:: html
.. image:: images/quansight-labs.png :width: 100pt :align: center :target: https://labs.quansight.org .. raw:: html
Past Sponsors ............. .. raw:: html
`Columbia University `_ funded Andreas Müller (2016-2020). .. raw:: html
.. image:: images/columbia.png :width: 50pt :align: center :target: https://www.columbia.edu/ .. raw:: html
........... .. raw:: html
Andreas Müller received a grant to improve scikit-learn from the `Alfred P. Sloan Foundation `_ . This grant supported the position of Nicolas Hug and Thomas J. Fan. .. raw:: html
.. image:: images/sloan_banner.png :width: 100pt :align: center :target: https://sloan.org/ .. raw:: html
............. .. raw:: html
`INRIA `_ actively supports this project. It has provided funding for Fabian Pedregosa (2010-2012), Jaques Grobler (2012-2013) and Olivier Grisel (2013-2017) to work on this project full-time. It also hosts coding sprints and other events. .. raw:: html
.. image:: images/inria-logo.jpg :width: 100pt :align: center :target: https://www.inria.fr .. raw:: html
..................... .. raw:: html
`Paris-Saclay Center for Data Science `_ funded one year for a developer to work on the project full-time (2014-2015), 50% of the time of Guillaume Lemaitre (2016-2017) and 50% of the time of Joris van den Bossche (2017-2018). .. raw:: html
.. image:: images/cds-logo.png :width: 100pt :align: center :target: https://www.datascience-paris-saclay.fr/ .. raw:: html
............ .. raw:: html
`Anaconda, Inc `_ funded Adrin Jalali in 2019. .. raw:: html
.. image:: images/anaconda.png
   :width: 100pt
   :align: center
   :target: https://www.anaconda.com/

.. raw:: html
.......................... .. raw:: html
`NYU Moore-Sloan Data Science Environment `_ funded Andreas Mueller
(2014-2016) to work on this project. The Moore-Sloan Data Science Environment
also funded several students to work on the project part-time.

.. raw:: html
.. image:: images/nyu_short_color.png
   :width: 100pt
   :align: center
   :target: https://cds.nyu.edu/mooresloan/

.. raw:: html
........................ .. raw:: html
`Télécom Paristech `_ funded Manoj Kumar (2014), Tom Dupré la Tour (2015), Raghav RV (2015-2017), Thierry Guillemot (2016-2017) and Albert Thomas (2017) to work on scikit-learn. .. raw:: html
.. image:: images/telecom.png
   :width: 50pt
   :align: center
   :target: https://www.telecom-paristech.fr/

.. raw:: html
..................... .. raw:: html
`The Labex DigiCosme `_ funded Nicolas Goix (2015-2016), Tom Dupré la Tour
(2015-2016 and 2017-2018) and Mathurin Massias (2018-2019) to work part-time
on scikit-learn during their PhDs. It also funded a scikit-learn coding
sprint in 2015.

.. raw:: html
.. image:: images/digicosme.png
   :width: 100pt
   :align: center
   :target: https://digicosme.lri.fr

.. raw:: html
..................... .. raw:: html
`The Chan-Zuckerberg Initiative `_ funded Nicolas Hug to work full-time on scikit-learn in 2020. .. raw:: html
.. image:: images/czi_logo.svg
   :width: 100pt
   :align: center
   :target: https://chanzuckerberg.com

.. raw:: html
......................

The following students were sponsored by `Google `_ to work on scikit-learn
through the `Google Summer of Code `_ program.

- 2007 - David Cournapeau
- 2011 - `Vlad Niculae`_
- 2012 - `Vlad Niculae`_, Immanuel Bayer
- 2013 - Kemal Eren, Nicolas Trésegnie
- 2014 - Hamzeh Alsalhi, Issam Laradji, Maheshakya Wijewardena, Manoj Kumar
- 2015 - `Raghav RV `_, Wei Xue
- 2016 - `Nelson Liu `_, `YenChen Lin `_

.. _Vlad Niculae: https://vene.ro/

...................

The `NeuroDebian `_ project, which provides `Debian `_ packaging and
contributions, is supported by `Dr. James V. Haxby `_
(`Dartmouth College `_).

Sprints
-------

The International 2019 Paris sprint was kindly hosted by `AXA `_. Some
participants could also attend thanks to the support of the `Alfred P. Sloan
Foundation `_, the `Python Software Foundation `_ (PSF) and the `DATAIA
Institute `_.

.....................

The 2013 International Paris Sprint was made possible thanks to the support of
`Télécom Paristech `_, `tinyclues `_, the `French Python Association `_ and
the `Fonds de la Recherche Scientifique `_.

..............

The 2011 International Granada sprint was made possible thanks to the support
of the `PSF `_ and `tinyclues `_.

Donating to the project
.......................

If you are interested in donating to the project or to one of our code
sprints, you can use the *Paypal* button below or the `NumFOCUS Donations
Page `_ (if you use the latter, please indicate that you are donating for
the scikit-learn project).

All donations will be handled by `NumFOCUS `_, a non-profit organization
managed by a board of `Scipy community members `_. NumFOCUS's mission is to
foster scientific computing software, in particular in Python. As the fiscal
home of scikit-learn, it ensures that money is available when needed to keep
the project funded and available while in compliance with tax regulations.

Donations received for the scikit-learn project will mostly go towards
covering travel expenses for code sprints, as well as towards the
organization budget of the project [#f1]_.

.. raw :: html


.. rubric:: Notes

.. [#f1] Regarding the organization budget, in particular, we might use some
         of the donated funds to pay for other project expenses such as DNS,
         hosting or continuous integration services.

Infrastructure support
----------------------

- We would also like to thank `Microsoft Azure `_, `Travis CI `_, `CircleCI `_
  for free CPU time on their Continuous Integration servers, and `Anaconda
  Inc. `_ for the storage they provide for our staging and nightly builds.

================================================
FILE: doc/authors.rst
================================================

.. raw :: html

- Jérémie du Boisberranger
- Joris Van den Bossche
- Loïc Estève
- Thomas J. Fan
- Alexandre Gramfort
- Olivier Grisel
- Yaroslav Halchenko
- Nicolas Hug
- Adrin Jalali
- Julien Jerphanion
- Guillaume Lemaitre
- Christian Lorentzen
- Jan Hendrik Metzen
- Andreas Mueller
- Vlad Niculae
- Joel Nothman
- Hanmin Qin
- Bertrand Thirion
- Tom Dupré la Tour
- Gael Varoquaux
- Nelle Varoquaux
- Roman Yurchak

================================================
FILE: doc/authors_emeritus.rst
================================================

- Mathieu Blondel
- Matthieu Brucher
- Lars Buitinck
- David Cournapeau
- Noel Dawe
- Vincent Dubourg
- Edouard Duchesnay
- Alexander Fabisch
- Virgile Fritsch
- Satrajit Ghosh
- Angel Soler Gollonet
- Chris Gorgolewski
- Jaques Grobler
- Brian Holt
- Arnaud Joly
- Thouis (Ray) Jones
- Kyle Kastner
- manoj kumar
- Robert Layton
- Wei Li
- Paolo Losi
- Gilles Louppe
- Vincent Michel
- Jarrod Millman
- Alexandre Passos
- Fabian Pedregosa
- Peter Prettenhofer
- (Venkat) Raghav, Rajagopalan
- Jacob Schreiber
- Du Shiqiao
- Jake Vanderplas
- David Warde-Farley
- Ron Weiss

================================================
FILE: doc/binder/requirements.txt
================================================

# A binder requirement file is required by sphinx-gallery. We don't really
# need one since our binder requirement file lives in the .binder directory.
# This file can be removed if 'dependencies' is made an optional key for
# binder in sphinx-gallery.

================================================
FILE: doc/common_pitfalls.rst
================================================

.. Places parent toc into the sidebar

:parenttoc: True

.. include:: includes/big_toc_css.rst

.. _common_pitfalls:

=========================================
Common pitfalls and recommended practices
=========================================

The purpose of this chapter is to illustrate some common pitfalls and
anti-patterns that occur when using scikit-learn. It provides examples of
what **not** to do, along with a corresponding correct example.

Inconsistent preprocessing
==========================

scikit-learn provides a library of :ref:`data-transforms`, which may clean
(see :ref:`preprocessing`), reduce (see :ref:`data_reduction`), expand (see
:ref:`kernel_approximation`) or generate (see :ref:`feature_extraction`)
feature representations. If these data transforms are used when training a
model, they must also be used on subsequent datasets, whether it's test data
or data in a production system. Otherwise, the feature space will change, and
the model will not be able to perform effectively.

For the following example, let's create a synthetic dataset with a single
feature::

    >>> from sklearn.datasets import make_regression
    >>> from sklearn.model_selection import train_test_split

    >>> random_state = 42
    >>> X, y = make_regression(random_state=random_state, n_features=1,
    ...                        noise=1)
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X, y, test_size=0.4, random_state=random_state)

**Wrong**

The train dataset is scaled, but not the test dataset, so model performance
on the test dataset is worse than expected::

    >>> from sklearn.metrics import mean_squared_error
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.preprocessing import StandardScaler

    >>> scaler = StandardScaler()
    >>> X_train_transformed = scaler.fit_transform(X_train)
    >>> model = LinearRegression().fit(X_train_transformed, y_train)
    >>> mean_squared_error(y_test, model.predict(X_test))
    62.80...

**Right**

Instead of passing the non-transformed `X_test` to `predict`, we should
transform the test data, the same way we transformed the training data::

    >>> X_test_transformed = scaler.transform(X_test)
    >>> mean_squared_error(y_test, model.predict(X_test_transformed))
    0.90...
Alternatively, we recommend using a :class:`Pipeline `, which makes it easier
to chain transformations with estimators, and reduces the possibility of
forgetting a transformation::

    >>> from sklearn.pipeline import make_pipeline
    >>> model = make_pipeline(StandardScaler(), LinearRegression())
    >>> model.fit(X_train, y_train)
    Pipeline(steps=[('standardscaler', StandardScaler()),
                    ('linearregression', LinearRegression())])
    >>> mean_squared_error(y_test, model.predict(X_test))
    0.90...

Pipelines also help avoid another common pitfall: leaking the test data into
the training data.

.. _data_leakage:

Data leakage
============

Data leakage occurs when information that would not be available at
prediction time is used when building the model. This results in overly
optimistic performance estimates, for example from :ref:`cross-validation `,
and thus poorer performance when the model is used on actually novel data,
for example during production.

A common cause is not keeping the test and train data subsets separate. Test
data should never be used to make choices about the model. **The general rule
is to never call** `fit` **on the test data**. While this may sound obvious,
this is easy to miss in some cases, for example when applying certain
pre-processing steps.

Although both train and test data subsets should receive the same
preprocessing transformation (as described in the previous section), it is
important that these transformations are only learnt from the training data.
For example, if you have a normalization step where you divide by the average
value, the average should be the average of the train subset, **not** the
average of all the data. If the test subset is included in the average
calculation, information from the test subset influences the model.

An example of data leakage during preprocessing is detailed below.

Data leakage during pre-processing
----------------------------------

.. note::

    We here choose to illustrate data leakage with a feature selection step.
    This risk of leakage is however relevant with almost all transformations
    in scikit-learn, including (but not limited to)
    :class:`~sklearn.preprocessing.StandardScaler`,
    :class:`~sklearn.impute.SimpleImputer`, and
    :class:`~sklearn.decomposition.PCA`.

A number of :ref:`feature_selection` functions are available in scikit-learn.
They can help remove irrelevant, redundant and noisy features as well as
improve your model build time and performance. As with any other type of
preprocessing, feature selection should **only** use the training data.
Including the test data in feature selection will optimistically bias your
model.

To demonstrate, we will create this binary classification problem with
10,000 randomly generated features::

    >>> import numpy as np
    >>> n_samples, n_features, n_classes = 200, 10000, 2
    >>> rng = np.random.RandomState(42)
    >>> X = rng.standard_normal((n_samples, n_features))
    >>> y = rng.choice(n_classes, n_samples)

**Wrong**

Using all the data to perform feature selection results in an accuracy score
much higher than chance, even though our targets are completely random. This
randomness means that our `X` and `y` are independent and we thus expect the
accuracy to be around 0.5. However, since the feature selection step 'sees'
the test data, the model has an unfair advantage. In the incorrect example
below we first use all the data for feature selection and then split the data
into training and test subsets for model fitting.
The result is a much higher than expected accuracy score::

    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.feature_selection import SelectKBest
    >>> from sklearn.ensemble import GradientBoostingClassifier
    >>> from sklearn.metrics import accuracy_score

    >>> # Incorrect preprocessing: the entire data is transformed
    >>> X_selected = SelectKBest(k=25).fit_transform(X, y)

    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X_selected, y, random_state=42)
    >>> gbc = GradientBoostingClassifier(random_state=1)
    >>> gbc.fit(X_train, y_train)
    GradientBoostingClassifier(random_state=1)

    >>> y_pred = gbc.predict(X_test)
    >>> accuracy_score(y_test, y_pred)
    0.76

**Right**

To prevent data leakage, it is good practice to split your data into train
and test subsets **first**. Feature selection can then be performed using
just the train dataset. Notice that whenever we use `fit` or `fit_transform`,
we only use the train dataset. The score is now what we would expect for the
data, close to chance::

    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X, y, random_state=42)
    >>> select = SelectKBest(k=25)
    >>> X_train_selected = select.fit_transform(X_train, y_train)
    >>> gbc = GradientBoostingClassifier(random_state=1)
    >>> gbc.fit(X_train_selected, y_train)
    GradientBoostingClassifier(random_state=1)

    >>> X_test_selected = select.transform(X_test)
    >>> y_pred = gbc.predict(X_test_selected)
    >>> accuracy_score(y_test, y_pred)
    0.46

Here again, we recommend using a :class:`~sklearn.pipeline.Pipeline` to chain
together the feature selection and model estimators. The pipeline ensures
that only the training data is used when performing `fit` and the test data
is used only for calculating the accuracy score::

    >>> from sklearn.pipeline import make_pipeline
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X, y, random_state=42)
    >>> pipeline = make_pipeline(SelectKBest(k=25),
    ...                          GradientBoostingClassifier(random_state=1))
    >>> pipeline.fit(X_train, y_train)
    Pipeline(steps=[('selectkbest', SelectKBest(k=25)),
                    ('gradientboostingclassifier',
                     GradientBoostingClassifier(random_state=1))])

    >>> y_pred = pipeline.predict(X_test)
    >>> accuracy_score(y_test, y_pred)
    0.46

The pipeline can also be fed into a cross-validation function such as
:func:`~sklearn.model_selection.cross_val_score`. Again, the pipeline ensures
that the correct data subset and estimator method are used during fitting and
predicting::

    >>> from sklearn.model_selection import cross_val_score
    >>> scores = cross_val_score(pipeline, X, y)
    >>> print(f"Mean accuracy: {scores.mean():.2f}+/-{scores.std():.2f}")
    Mean accuracy: 0.45+/-0.07

How to avoid data leakage
-------------------------

Below are some tips on avoiding data leakage:

* Always split the data into train and test subsets first, particularly
  before any preprocessing steps.
* Never include test data when using the `fit` and `fit_transform` methods.
  Using all the data, e.g., `fit(X)`, can result in overly optimistic scores.
  Conversely, the `transform` method should be used on both train and test
  subsets as the same preprocessing should be applied to all the data. This
  can be achieved by using `fit_transform` on the train subset and
  `transform` on the test subset.
* The scikit-learn :ref:`pipeline ` is a great way to prevent data leakage as
  it ensures that the appropriate method is performed on the correct data
  subset. The pipeline is ideal for use in cross-validation and
  hyper-parameter tuning functions.
.. _randomness:

Controlling randomness
======================

Some scikit-learn objects are inherently random. These are usually estimators
(e.g. :class:`~sklearn.ensemble.RandomForestClassifier`) and cross-validation
splitters (e.g. :class:`~sklearn.model_selection.KFold`). The randomness of
these objects is controlled via their `random_state` parameter, as described
in the :term:`Glossary `. This section expands on the glossary entry, and
describes good practices and common pitfalls w.r.t. this subtle parameter.

.. note:: Recommendation summary

    For optimal robustness of cross-validation (CV) results, pass
    `RandomState` instances when creating estimators, or leave `random_state`
    to `None`. Passing integers to CV splitters is usually the safest option
    and is preferable; passing `RandomState` instances to splitters may
    sometimes be useful to achieve very specific use-cases.

    For both estimators and splitters, passing an integer vs passing an
    instance (or `None`) leads to subtle but significant differences,
    especially for CV procedures. These differences are important to
    understand when reporting results.

    For reproducible results across executions, remove any use of
    `random_state=None`.

Using `None` or `RandomState` instances, and repeated calls to `fit` and `split`
--------------------------------------------------------------------------------

The `random_state` parameter determines whether multiple calls to :term:`fit`
(for estimators) or to :term:`split` (for CV splitters) will produce the same
results, according to these rules:

- If an integer is passed, calling `fit` or `split` multiple times always
  yields the same results.
- If `None` or a `RandomState` instance is passed: `fit` and `split` will
  yield different results each time they are called, and the succession of
  calls explores all sources of entropy. `None` is the default value for all
  `random_state` parameters.

We here illustrate these rules for both estimators and CV splitters.

.. note::

    Since passing `random_state=None` is equivalent to passing the global
    `RandomState` instance from `numpy`
    (`random_state=np.random.mtrand._rand`), we will not explicitly mention
    `None` here. Everything that applies to instances also applies to using
    `None`.

Estimators
..........

Passing instances means that calling `fit` multiple times will not yield the
same results, even if the estimator is fitted on the same data and with the
same hyper-parameters::

    >>> from sklearn.linear_model import SGDClassifier
    >>> from sklearn.datasets import make_classification
    >>> import numpy as np

    >>> rng = np.random.RandomState(0)
    >>> X, y = make_classification(n_features=5, random_state=rng)
    >>> sgd = SGDClassifier(random_state=rng)

    >>> sgd.fit(X, y).coef_
    array([[ 8.85418642,  4.79084103, -3.13077794,  8.11915045, -0.56479934]])

    >>> sgd.fit(X, y).coef_
    array([[ 6.70814003,  5.25291366, -7.55212743,  5.18197458,  1.37845099]])

We can see from the snippet above that repeatedly calling `sgd.fit` has
produced different models, even if the data was the same. This is because the
Random Number Generator (RNG) of the estimator is consumed (i.e. mutated)
when `fit` is called, and this mutated RNG will be used in the subsequent
calls to `fit`. In addition, the `rng` object is shared across all objects
that use it, and as a consequence, these objects become somewhat
inter-dependent. For example, two estimators that share the same
`RandomState` instance will influence each other, as we will see later when
we discuss cloning.
This point is important to keep in mind when debugging.

If we had passed an integer to the `random_state` parameter of the
:class:`~sklearn.linear_model.SGDClassifier`, we would have obtained the same
models, and thus the same scores each time. When we pass an integer, the same
RNG is used across all calls to `fit`. What internally happens is that even
though the RNG is consumed when `fit` is called, it is always reset to its
original state at the beginning of `fit`.

CV splitters
............

Randomized CV splitters have a similar behavior when a `RandomState` instance
is passed; calling `split` multiple times yields different data splits::

    >>> from sklearn.model_selection import KFold
    >>> import numpy as np

    >>> X = y = np.arange(10)
    >>> rng = np.random.RandomState(0)
    >>> cv = KFold(n_splits=2, shuffle=True, random_state=rng)

    >>> for train, test in cv.split(X, y):
    ...     print(train, test)
    [0 3 5 6 7] [1 2 4 8 9]
    [1 2 4 8 9] [0 3 5 6 7]

    >>> for train, test in cv.split(X, y):
    ...     print(train, test)
    [0 4 6 7 8] [1 2 3 5 9]
    [1 2 3 5 9] [0 4 6 7 8]

We can see that the splits are different the second time `split` is called.
This may lead to unexpected results if you compare the performance of
multiple estimators by calling `split` many times, as we will see in the next
section.

Common pitfalls and subtleties
------------------------------

While the rules that govern the `random_state` parameter are seemingly
simple, they do however have some subtle implications. In some cases, this
can even lead to wrong conclusions.

Estimators
..........

**Different `random_state` types lead to different cross-validation
procedures**

Depending on the type of the `random_state` parameter, estimators will behave
differently, especially in cross-validation procedures. Consider the
following snippet::

    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.model_selection import cross_val_score
    >>> import numpy as np

    >>> X, y = make_classification(random_state=0)

    >>> rf_123 = RandomForestClassifier(random_state=123)
    >>> cross_val_score(rf_123, X, y)
    array([0.85, 0.95, 0.95, 0.9 , 0.9 ])

    >>> rf_inst = RandomForestClassifier(random_state=np.random.RandomState(0))
    >>> cross_val_score(rf_inst, X, y)
    array([0.9 , 0.95, 0.95, 0.9 , 0.9 ])

We see that the cross-validated scores of `rf_123` and `rf_inst` are
different, as should be expected since we didn't pass the same `random_state`
parameter. However, the difference between these scores is more subtle than
it looks, and **the cross-validation procedures that were performed by**
:func:`~sklearn.model_selection.cross_val_score` **significantly differ in
each case**:

- Since `rf_123` was passed an integer, every call to `fit` uses the same
  RNG: this means that all random characteristics of the random forest
  estimator will be the same for each of the 5 folds of the CV procedure. In
  particular, the (randomly chosen) subset of features of the estimator will
  be the same across all folds.
- Since `rf_inst` was passed a `RandomState` instance, each call to `fit`
  starts from a different RNG. As a result, the random subset of features
  will be different for each fold.

While having a constant estimator RNG across folds isn't inherently wrong, we
usually want CV results that are robust w.r.t. the estimator's randomness. As
a result, passing an instance instead of an integer may be preferable, since
it will allow the estimator RNG to vary for each fold.
.. note::

    Here, :func:`~sklearn.model_selection.cross_val_score` will use a
    non-randomized CV splitter (as is the default), so both estimators will
    be evaluated on the same splits. This section is not about variability in
    the splits. Also, whether we pass an integer or an instance to
    :func:`~sklearn.datasets.make_classification` isn't relevant for our
    illustration purpose: what matters is what we pass to the
    :class:`~sklearn.ensemble.RandomForestClassifier` estimator.

**Cloning**

Another subtle side effect of passing `RandomState` instances is how
:func:`~sklearn.clone` will work::

    >>> from sklearn import clone
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> import numpy as np

    >>> rng = np.random.RandomState(0)
    >>> a = RandomForestClassifier(random_state=rng)
    >>> b = clone(a)

Since a `RandomState` instance was passed to `a`, `a` and `b` are not clones
in the strict sense, but rather clones in the statistical sense: `a` and `b`
will still be different models, even when calling `fit(X, y)` on the same
data. Moreover, `a` and `b` will influence each other since they share the
same internal RNG: calling `a.fit` will consume `b`'s RNG, and calling
`b.fit` will consume `a`'s RNG, since they are the same. This is true for any
estimators that share a `random_state` parameter; it is not specific to
clones.

If an integer were passed, `a` and `b` would be exact clones and they would
not influence each other.

.. warning::

    Even though :func:`~sklearn.clone` is rarely used in user code, it is
    called pervasively throughout the scikit-learn codebase: in particular,
    most meta-estimators that accept non-fitted estimators call
    :func:`~sklearn.clone` internally
    (:class:`~sklearn.model_selection.GridSearchCV`,
    :class:`~sklearn.ensemble.StackingClassifier`,
    :class:`~sklearn.calibration.CalibratedClassifierCV`, etc.).

CV splitters
............

When passed a `RandomState` instance, CV splitters yield different splits
each time `split` is called. When comparing different estimators, this can
lead to overestimating the variance of the difference in performance between
the estimators::

    >>> from sklearn.naive_bayes import GaussianNB
    >>> from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.model_selection import KFold
    >>> from sklearn.model_selection import cross_val_score
    >>> import numpy as np

    >>> rng = np.random.RandomState(0)
    >>> X, y = make_classification(random_state=rng)
    >>> cv = KFold(shuffle=True, random_state=rng)
    >>> lda = LinearDiscriminantAnalysis()
    >>> nb = GaussianNB()

    >>> for est in (lda, nb):
    ...     print(cross_val_score(est, X, y, cv=cv))
    [0.8  0.75 0.75 0.7  0.85]
    [0.85 0.95 0.95 0.85 0.95]

Directly comparing the performance of the
:class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis` estimator
vs the :class:`~sklearn.naive_bayes.GaussianNB` estimator **on each fold**
would be a mistake: **the splits on which the estimators are evaluated are
different**. Indeed, :func:`~sklearn.model_selection.cross_val_score` will
internally call `cv.split` on the same
:class:`~sklearn.model_selection.KFold` instance, but the splits will be
different each time. This is also true for any tool that performs model
selection via cross-validation, e.g.
:class:`~sklearn.model_selection.GridSearchCV` and
:class:`~sklearn.model_selection.RandomizedSearchCV`: scores are not
comparable fold-to-fold across different calls to `search.fit`, since
`cv.split` would have been called multiple times.
Within a single call to `search.fit`, however, fold-to-fold comparison is
possible since the search estimator only calls `cv.split` once.

For comparable fold-to-fold results in all scenarios, one should pass an
integer to the CV splitter: `cv = KFold(shuffle=True, random_state=0)`.

.. note::

    While fold-to-fold comparison is not advisable with `RandomState`
    instances, one can however expect that average scores allow one to
    conclude whether one estimator is better than another, as long as enough
    folds and data are used.

.. note::

    What matters in this example is what was passed to
    :class:`~sklearn.model_selection.KFold`. Whether we pass a `RandomState`
    instance or an integer to :func:`~sklearn.datasets.make_classification`
    is not relevant for our illustration purpose. Also, neither
    :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis` nor
    :class:`~sklearn.naive_bayes.GaussianNB` are randomized estimators.

General recommendations
-----------------------

Getting reproducible results across multiple executions
........................................................

In order to obtain reproducible (i.e. constant) results across multiple
*program executions*, we need to remove all uses of `random_state=None`,
which is the default. The recommended way is to declare a `rng` variable at
the top of the program, and pass it down to any object that accepts a
`random_state` parameter::

    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.model_selection import train_test_split
    >>> import numpy as np

    >>> rng = np.random.RandomState(0)
    >>> X, y = make_classification(random_state=rng)
    >>> rf = RandomForestClassifier(random_state=rng)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=rng)
    >>> rf.fit(X_train, y_train).score(X_test, y_test)
    0.84

We are now guaranteed that the result of this script will always be 0.84, no
matter how many times we run it. Changing the global `rng` variable to a
different value should affect the results, as expected.

It is also possible to declare the `rng` variable as an integer. This may
however lead to less robust cross-validation results, as we will see in the
next section.

.. note::

    We do not recommend setting the global `numpy` seed by calling
    `np.random.seed(0)`. See `here `_ for a discussion.

Robustness of cross-validation results
......................................

When we evaluate a randomized estimator performance by cross-validation, we
want to make sure that the estimator can yield accurate predictions for new
data, but we also want to make sure that the estimator is robust w.r.t. its
random initialization. For example, we would like the random weights
initialization of a :class:`~sklearn.linear_model.SGDClassifier` to be
consistently good across all folds: otherwise, when we train that estimator
on new data, we might get unlucky and the random initialization may lead to
bad performance. Similarly, we want a random forest to be robust w.r.t. the
set of randomly selected features that each tree will be using.

For these reasons, it is preferable to evaluate the cross-validation
performance by letting the estimator use a different RNG on each fold. This
is done by passing a `RandomState` instance (or `None`) to the estimator
initialization.

When we pass an integer, the estimator will use the same RNG on each fold: if
the estimator performs well (or badly), as evaluated by CV, it might just be
because we got lucky (or unlucky) with that specific seed.
Passing instances leads to more robust CV results, and makes the comparison
between various algorithms fairer. It also helps limit the temptation to
treat the estimator's RNG as a hyper-parameter that can be tuned.

Whether we pass `RandomState` instances or integers to CV splitters has no
impact on robustness, as long as `split` is only called once. When `split` is
called multiple times, fold-to-fold comparison isn't possible anymore. As a
result, passing an integer to CV splitters is usually safer and covers most
use-cases.

================================================
FILE: doc/communication_team.rst
================================================

.. raw :: html

- Reshama Shaikh
- Lauren Burke

================================================
FILE: doc/computing/computational_performance.rst
================================================

.. Places parent toc into the sidebar

:parenttoc: True

.. _computational_performance:

.. currentmodule:: sklearn

Computational Performance
=========================

For some applications the performance (mainly latency and throughput at
prediction time) of estimators is crucial. It may also be of interest to
consider the training throughput but this is often less important in a
production setup (where it often takes place offline).

We will review here the orders of magnitude you can expect from a number of
scikit-learn estimators in different contexts and provide some tips and
tricks for overcoming performance bottlenecks.

Prediction latency is measured as the elapsed time necessary to make a
prediction (e.g. in micro-seconds). Latency is often viewed as a distribution
and operations engineers often focus on the latency at a given percentile of
this distribution (e.g. the 90th percentile).

Prediction throughput is defined as the number of predictions the software
can deliver in a given amount of time (e.g. in predictions per second).

An important aspect of performance optimization is also that it can hurt
prediction accuracy. Indeed, simpler models (e.g. linear instead of
non-linear, or with fewer parameters) often run faster but are not always
able to take into account the same exact properties of the data as more
complex ones.

Prediction Latency
------------------

One of the most straight-forward concerns one may have when using/choosing a
machine learning toolkit is the latency at which predictions can be made in a
production environment.

The main factors that influence the prediction latency are:

1. Number of features
2. Input data representation and sparsity
3. Model complexity
4. Feature extraction

A last major factor is the possibility to do predictions in bulk or in
one-at-a-time mode.

Bulk versus Atomic mode
........................

In general doing predictions in bulk (many instances at the same time) is
more efficient for a number of reasons (branching predictability, CPU cache,
linear algebra libraries optimizations etc.). Here, in a setting with few
features, we see that independently of the estimator choice the bulk mode is
always faster, and for some estimators by 1 to 2 orders of magnitude:

.. |atomic_prediction_latency| image:: ../auto_examples/applications/images/sphx_glr_plot_prediction_latency_001.png
   :target: ../auto_examples/applications/plot_prediction_latency.html
   :scale: 80

.. centered:: |atomic_prediction_latency|

.. |bulk_prediction_latency| image:: ../auto_examples/applications/images/sphx_glr_plot_prediction_latency_002.png
   :target: ../auto_examples/applications/plot_prediction_latency.html
   :scale: 80

.. centered:: |bulk_prediction_latency|

To benchmark different estimators for your case you can simply change the
``n_features`` parameter in this example:
:ref:`sphx_glr_auto_examples_applications_plot_prediction_latency.py`. This
should give you an estimate of the order of magnitude of the prediction
latency.
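If you want a quick sense of the bulk vs. atomic gap for your own model, a
minimal timing sketch along the following lines can help
(:class:`~linear_model.Ridge` and the synthetic data here are arbitrary
placeholders, and ``time.perf_counter`` timings are only indicative, not a
rigorous benchmark)::

    import time

    from sklearn.datasets import make_regression
    from sklearn.linear_model import Ridge

    X, y = make_regression(n_samples=10000, n_features=50, random_state=0)
    model = Ridge().fit(X, y)

    # atomic mode: one instance per call to predict
    start = time.perf_counter()
    for i in range(1000):
        model.predict(X[i].reshape(1, -1))
    atomic = time.perf_counter() - start

    # bulk mode: the same 1000 instances in a single call
    start = time.perf_counter()
    model.predict(X[:1000])
    bulk = time.perf_counter() - start

    print(f"atomic: {atomic:.4f}s, bulk: {bulk:.4f}s")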
Configuring Scikit-learn for reduced validation overhead
.........................................................

Scikit-learn does some validation on data that increases the overhead per
call to ``predict`` and similar functions. In particular, checking that
features are finite (not NaN or infinite) involves a full pass over the data.
If you ensure that your data is acceptable, you may suppress checking for
finiteness by setting the environment variable ``SKLEARN_ASSUME_FINITE`` to a
non-empty string before importing scikit-learn, or configure it in Python
with :func:`set_config`. For more control than these global settings, a
:func:`config_context` allows you to set this configuration within a
specified context::

    >>> import sklearn
    >>> with sklearn.config_context(assume_finite=True):
    ...     pass  # do learning/prediction here with reduced validation

Note that this will affect all uses of :func:`~utils.assert_all_finite`
within the context.

Influence of the Number of Features
....................................

Obviously when the number of features increases so does the memory
consumption of each example. Indeed, for a matrix of :math:`M` instances with
:math:`N` features, the space complexity is in :math:`O(NM)`. From a
computing perspective it also means that the number of basic operations
(e.g., multiplications for vector-matrix products in linear models) increases
too. Here is a graph of the evolution of the prediction latency with the
number of features:

.. |influence_of_n_features_on_latency| image:: ../auto_examples/applications/images/sphx_glr_plot_prediction_latency_003.png
   :target: ../auto_examples/applications/plot_prediction_latency.html
   :scale: 80

.. centered:: |influence_of_n_features_on_latency|

Overall you can expect the prediction time to increase at least linearly with
the number of features (non-linear cases can happen depending on the global
memory footprint and estimator).

Influence of the Input Data Representation
...........................................

Scipy provides sparse matrix data structures which are optimized for storing
sparse data. The main feature of sparse formats is that you don't store
zeros, so if your data is sparse then you use much less memory. A non-zero
value in a sparse (`CSR or CSC `_) representation will only take on average
one 32bit integer position + the 64 bit floating point value + an additional
32bit per row or column in the matrix. Using sparse input on a dense (or
sparse) linear model can speed up prediction by quite a bit as only the
non-zero valued features impact the dot product and thus the model
predictions. Hence if you have 100 non-zeros in 1e6 dimensional space, you
only need 100 multiply and add operations instead of 1e6.

Calculation over a dense representation, however, may leverage highly
optimised vector operations and multithreading in BLAS, and tends to result
in fewer CPU cache misses. So the sparsity should typically be quite high
(10% non-zeros max, to be checked depending on the hardware) for the sparse
input representation to be faster than the dense input representation on a
machine with many CPUs and an optimized BLAS implementation.

Here is sample code to test the sparsity of your input::

    def sparsity_ratio(X):
        return 1.0 - np.count_nonzero(X) / float(X.shape[0] * X.shape[1])
    print("input sparsity ratio:", sparsity_ratio(X))

As a rule of thumb you can consider that if the sparsity ratio is greater
than 90% you can probably benefit from sparse formats. Check Scipy's sparse
matrix formats `documentation `_ for more information on how to build (or
convert your data to) sparse matrix formats. Most of the time the ``CSR`` and
``CSC`` formats work best.
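As a rough illustration of that rule of thumb, the following sketch (with
synthetic data and :class:`~linear_model.SGDRegressor` as placeholder
choices) converts a sufficiently sparse input to ``CSR`` before calling
``predict``::

    import numpy as np
    from scipy import sparse

    from sklearn.linear_model import SGDRegressor

    rng = np.random.RandomState(0)
    X = rng.standard_normal((1000, 500))
    X[X < 2.0] = 0.0  # keep only ~2% of the values: a very sparse input
    y = rng.standard_normal(1000)

    model = SGDRegressor().fit(X, y)

    def sparsity_ratio(X):
        return 1.0 - np.count_nonzero(X) / float(X.shape[0] * X.shape[1])

    if sparsity_ratio(X) > 0.9:   # the 90% rule of thumb from above
        X = sparse.csr_matrix(X)  # the sparse dot product skips the zeros

    predictions = model.predict(X)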
Influence of the Model Complexity
..................................

Generally speaking, when model complexity increases, predictive power and
latency are supposed to increase. Increasing predictive power is usually
interesting, but for many applications it is better not to increase
prediction latency too much. We will now review this idea for different
families of supervised models.

For :mod:`sklearn.linear_model` (e.g. Lasso, ElasticNet,
SGDClassifier/Regressor, Ridge & RidgeClassifier,
PassiveAggressiveClassifier/Regressor, LinearSVC, LogisticRegression...) the
decision function that is applied at prediction time is the same (a dot
product), so latency should be equivalent.

Here is an example using :class:`~linear_model.SGDClassifier` with the
``elasticnet`` penalty. The regularization strength is globally controlled by
the ``alpha`` parameter. With a sufficiently high ``alpha``, one can then
increase the ``l1_ratio`` parameter of ``elasticnet`` to enforce various
levels of sparsity in the model coefficients. Higher sparsity here is
interpreted as less model complexity as we need fewer coefficients to
describe it fully. Of course sparsity influences in turn the prediction time
as the sparse dot-product takes time roughly proportional to the number of
non-zero coefficients.

.. |en_model_complexity| image:: ../auto_examples/applications/images/sphx_glr_plot_model_complexity_influence_001.png
   :target: ../auto_examples/applications/plot_model_complexity_influence.html
   :scale: 80

.. centered:: |en_model_complexity|

For the :mod:`sklearn.svm` family of algorithms with a non-linear kernel, the
latency is tied to the number of support vectors (the fewer the faster).
Latency and throughput should (asymptotically) grow linearly with the number
of support vectors in a SVC or SVR model. The kernel will also influence the
latency as it is used to compute the projection of the input vector once per
support vector. In the following graph the ``nu`` parameter of
:class:`~svm.NuSVR` was used to influence the number of support vectors.

.. |nusvr_model_complexity| image:: ../auto_examples/applications/images/sphx_glr_plot_model_complexity_influence_002.png
   :target: ../auto_examples/applications/plot_model_complexity_influence.html
   :scale: 80

.. centered:: |nusvr_model_complexity|

For :mod:`sklearn.ensemble` of trees (e.g. RandomForest, GBT, ExtraTrees
etc) the number of trees and their depth play the most important role.
Latency and throughput should scale linearly with the number of trees. In
this case we directly used the ``n_estimators`` parameter of
:class:`~ensemble.GradientBoostingRegressor`.

.. |gbt_model_complexity| image:: ../auto_examples/applications/images/sphx_glr_plot_model_complexity_influence_003.png
   :target: ../auto_examples/applications/plot_model_complexity_influence.html
   :scale: 80

.. centered:: |gbt_model_complexity|

In any case be warned that decreasing model complexity can hurt accuracy as
mentioned above. For instance a non-linearly separable problem can be handled
with a speedy linear model but prediction power will very likely suffer in
the process.
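To get a feel for how the number of trees drives latency on your own data, a
minimal timing sketch such as the following may help (the synthetic dataset
and parameter values are placeholders, not a rigorous benchmark)::

    import time

    from sklearn.datasets import make_regression
    from sklearn.ensemble import GradientBoostingRegressor

    X, y = make_regression(n_samples=1000, n_features=20, random_state=0)

    for n_estimators in (10, 100, 1000):
        model = GradientBoostingRegressor(
            n_estimators=n_estimators, random_state=0).fit(X, y)
        start = time.perf_counter()
        model.predict(X)
        print(f"n_estimators={n_estimators}: "
              f"{time.perf_counter() - start:.4f}s")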
Feature Extraction Latency
..........................

Most scikit-learn models are usually pretty fast as they are implemented
either with compiled Cython extensions or optimized computing libraries. On
the other hand, in many real world applications the feature extraction
process (i.e. turning raw data like database rows or network packets into
numpy arrays) governs the overall prediction time. For example on the Reuters
text classification task the whole preparation (reading and parsing SGML
files, tokenizing the text and hashing it into a common vector space) takes
100 to 500 times more time than the actual prediction code, depending on the
chosen model.

.. |prediction_time| image:: ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_004.png
   :target: ../auto_examples/applications/plot_out_of_core_classification.html
   :scale: 80

.. centered:: |prediction_time|

In many cases it is thus recommended to carefully time and profile your
feature extraction code as it may be a good place to start optimizing when
your overall latency is too high for your application.

Prediction Throughput
----------------------

Another important metric to care about when sizing production systems is the
throughput i.e. the number of predictions you can make in a given amount of
time. Here is a benchmark from the
:ref:`sphx_glr_auto_examples_applications_plot_prediction_latency.py` example
that measures this quantity for a number of estimators on synthetic data:

.. |throughput_benchmark| image:: ../auto_examples/applications/images/sphx_glr_plot_prediction_latency_004.png
   :target: ../auto_examples/applications/plot_prediction_latency.html
   :scale: 80

.. centered:: |throughput_benchmark|

These throughputs are achieved on a single process. An obvious way to
increase the throughput of your application is to spawn additional instances
(usually processes in Python because of the `GIL `_) that share the same
model. One might also add machines to spread the load. A detailed explanation
on how to achieve this is beyond the scope of this documentation though.

Tips and Tricks
----------------

Linear algebra libraries
.........................

As scikit-learn relies heavily on Numpy/Scipy and linear algebra in general
it makes sense to take explicit care of the versions of these libraries.
Basically, you ought to make sure that Numpy is built using an optimized
`BLAS `_ / `LAPACK `_ library.

Not all models benefit from optimized BLAS and Lapack implementations. For
instance models based on (randomized) decision trees typically do not rely on
BLAS calls in their inner loops, nor do kernel SVMs (``SVC``, ``SVR``,
``NuSVC``, ``NuSVR``). On the other hand a linear model implemented with a
BLAS DGEMM call (via ``numpy.dot``) will typically benefit hugely from a
tuned BLAS implementation and lead to orders of magnitude speedup over a
non-optimized BLAS.

You can display the BLAS / LAPACK implementation used by your NumPy / SciPy /
scikit-learn install with the following commands::

    from numpy.distutils.system_info import get_info
    print(get_info('blas_opt'))
    print(get_info('lapack_opt'))

Optimized BLAS / LAPACK implementations include:

- Atlas (needs hardware-specific tuning by rebuilding on the target machine)
- OpenBLAS
- MKL
- Apple Accelerate and vecLib frameworks (OSX only)

More information can be found on the `Scipy install page `_ and in this
`blog post `_ from Daniel Nouri which has some nice step by step install
instructions for Debian / Ubuntu.
.. _working_memory:

Limiting Working Memory
........................

Some calculations when implemented using standard numpy vectorized operations
involve using a large amount of temporary memory. This may potentially
exhaust system memory. Where computations can be performed in fixed-memory
chunks, we attempt to do so, and allow the user to hint at the maximum size
of this working memory (defaulting to 1GB) using :func:`set_config` or
:func:`config_context`. The following shows how to limit temporary working
memory to 128 MiB::

    >>> import sklearn
    >>> with sklearn.config_context(working_memory=128):
    ...     pass  # do chunked work here

An example of a chunked operation adhering to this setting is
:func:`~metrics.pairwise_distances_chunked`, which facilitates computing
row-wise reductions of a pairwise distance matrix.

Model Compression
..................

Model compression in scikit-learn only concerns linear models for the moment.
In this context it means that we want to control the model sparsity (i.e. the
number of non-zero coordinates in the model vectors). It is generally a good
idea to combine model sparsity with sparse input data representation.

Here is sample code that illustrates the use of the ``sparsify()`` method::

    clf = SGDRegressor(penalty='elasticnet', l1_ratio=0.25)
    clf.fit(X_train, y_train).sparsify()
    clf.predict(X_test)

In this example we prefer the ``elasticnet`` penalty as it is often a good
compromise between model compactness and prediction power. One can also
further tune the ``l1_ratio`` parameter (in combination with the
regularization strength ``alpha``) to control this tradeoff.

A typical `benchmark `_ on synthetic data yields a >30% decrease in latency
when both the model and input are sparse (with 0.000024 and 0.027400
non-zero coefficients ratio respectively). Your mileage may vary depending on
the sparsity and size of your data and model. Furthermore, sparsifying can be
very useful to reduce the memory usage of predictive models deployed on
production servers.

Model Reshaping
................

Model reshaping consists of selecting only a portion of the available
features to fit a model. In other words, if a model discards features during
the learning phase we can then strip those from the input. This has several
benefits. Firstly it reduces memory (and therefore time) overhead of the
model itself. It also allows discarding explicit feature selection components
in a pipeline once we know which features to keep from a previous run.
Finally, it can help reduce processing time and I/O usage upstream in the
data access and feature extraction layers by not collecting and building
features that are discarded by the model. For instance if the raw data come
from a database, it can make it possible to write simpler and faster queries
or reduce I/O usage by making the queries return lighter records. At the
moment, reshaping needs to be performed manually in scikit-learn. In the case
of sparse input (particularly in ``CSR`` format), it is generally sufficient
to not generate the relevant features, leaving their columns empty.
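As a rough sketch of such manual reshaping for a sparsified linear model (the
synthetic data and estimator below are placeholder choices, not a prescribed
recipe)::

    import numpy as np

    from sklearn.datasets import make_regression
    from sklearn.linear_model import SGDRegressor

    X, y = make_regression(n_samples=500, n_features=100, random_state=0)
    clf = SGDRegressor(penalty='elasticnet', l1_ratio=0.25, alpha=0.1,
                       random_state=0).fit(X, y)

    # indices of the features with a non-zero coefficient
    kept = np.flatnonzero(clf.coef_)

    # upstream, only these columns need to be collected and built; the
    # prediction is a dot product against the matching coefficient slice
    y_pred = X[:, kept] @ clf.coef_[kept] + clf.intercept_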
Links
......

- :ref:`scikit-learn developer performance documentation `
- `Scipy sparse matrix formats documentation `_

================================================
FILE: doc/computing/parallelism.rst
================================================

.. Places parent toc into the sidebar

:parenttoc: True

Parallelism, resource management, and configuration
===================================================

.. _parallelism:

Parallelism
-----------

Some scikit-learn estimators and utilities can parallelize costly operations
using multiple CPU cores, thanks to the following components:

- via the `joblib `_ library. In this case the number of threads or processes
  can be controlled with the ``n_jobs`` parameter.
- via OpenMP, used in C or Cython code.

In addition, some of the numpy routines that are used internally by
scikit-learn may also be parallelized if numpy is installed with specific
numerical libraries such as MKL, OpenBLAS, or BLIS.

We describe these 3 scenarios in the following subsections.

Joblib-based parallelism
........................

When the underlying implementation uses joblib, the number of workers
(threads or processes) that are spawned in parallel can be controlled via the
``n_jobs`` parameter.

.. note::

    Where (and how) parallelization happens in the estimators is currently
    poorly documented. Please help us by improving our docs and tackle `issue
    14228 `_!

Joblib is able to support both multi-processing and multi-threading. Whether
joblib chooses to spawn a thread or a process depends on the **backend** that
it's using.

Scikit-learn generally relies on the ``loky`` backend, which is joblib's
default backend. Loky is a multi-processing backend. When doing
multi-processing, in order to avoid duplicating the memory in each process
(which isn't reasonable with big datasets), joblib will create a `memmap `_
that all processes can share, when the data is bigger than 1MB.

In some specific cases (when the code that is run in parallel releases the
GIL), scikit-learn will indicate to ``joblib`` that a multi-threading backend
is preferable.

As a user, you may control the backend that joblib will use (regardless of
what scikit-learn recommends) by using a context manager::

    from joblib import parallel_backend

    with parallel_backend('threading', n_jobs=2):
        # Your scikit-learn code here

Please refer to the `joblib docs `_ for more details.

In practice, whether parallelism is helpful in improving runtime depends on
many factors. It is usually a good idea to experiment rather than assuming
that increasing the number of workers is always a good thing. In some cases
it can be highly detrimental to performance to run multiple copies of some
estimators or functions in parallel (see oversubscription below).

OpenMP-based parallelism
........................

OpenMP is used to parallelize code written in Cython or C, relying on
multi-threading exclusively. By default (and unless joblib is trying to avoid
oversubscription), the implementation will use as many threads as possible.

You can control the exact number of threads that are used via the
``OMP_NUM_THREADS`` environment variable:

.. prompt:: bash $

    OMP_NUM_THREADS=4 python my_script.py

Parallel Numpy routines from numerical libraries
................................................

Scikit-learn relies heavily on NumPy and SciPy, which internally call
multi-threaded linear algebra routines implemented in libraries such as MKL,
OpenBLAS or BLIS.

The number of threads used by the OpenBLAS, MKL or BLIS libraries can be set
via the ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, and
``BLIS_NUM_THREADS`` environment variables.

Please note that scikit-learn has no direct control over these
implementations. Scikit-learn solely relies on Numpy and Scipy.

.. note::

    At the time of writing (2019), NumPy and SciPy packages distributed on
    pypi.org (used by ``pip``) and on the conda-forge channel are linked with
    OpenBLAS, while conda packages shipped on the "defaults" channel from
    anaconda.org are linked by default with MKL.
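These thread pools can also be limited programmatically. A minimal sketch
using the `threadpoolctl `_ package (an assumption here: it is a separate
package that must be installed, although recent scikit-learn versions use it
internally)::

    import numpy as np
    from threadpoolctl import threadpool_limits

    a = np.random.randn(1000, 1000)

    # cap any BLAS (MKL, OpenBLAS, BLIS) thread pool at 2 threads for the
    # duration of the context, without touching OpenMP thread pools
    with threadpool_limits(limits=2, user_api="blas"):
        a @ a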
Oversubscription: spawning too many threads
...........................................

It is generally recommended to avoid using significantly more processes or
threads than the number of CPUs on a machine. Over-subscription happens when
a program is running too many threads at the same time.

Suppose you have a machine with 8 CPUs. Consider a case where you're running
a :class:`~sklearn.model_selection.GridSearchCV` (parallelized with joblib)
with ``n_jobs=8`` over a
:class:`~sklearn.ensemble.HistGradientBoostingClassifier` (parallelized with
OpenMP). Each instance of
:class:`~sklearn.ensemble.HistGradientBoostingClassifier` will spawn 8
threads (since you have 8 CPUs). That's a total of ``8 * 8 = 64`` threads,
which leads to oversubscription of physical CPU resources and to scheduling
overhead.

Oversubscription can arise in the exact same fashion with parallelized
routines from MKL, OpenBLAS or BLIS that are nested in joblib calls.

Starting from ``joblib >= 0.14``, when the ``loky`` backend is used (which is
the default), joblib will tell its child **processes** to limit the number of
threads they can use, so as to avoid oversubscription. In practice the
heuristic that joblib uses is to tell the processes to use ``max_threads =
n_cpus // n_jobs``, via their corresponding environment variable. Back to our
example from above, since the joblib backend of
:class:`~sklearn.model_selection.GridSearchCV` is ``loky``, each process will
only be able to use 1 thread instead of 8, thus mitigating the
oversubscription issue.

Note that:

- Manually setting one of the environment variables (``OMP_NUM_THREADS``,
  ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, or ``BLIS_NUM_THREADS``)
  will take precedence over what joblib tries to do. The total number of
  threads will be ``n_jobs * _NUM_THREADS``. Note that setting this limit
  will also impact your computations in the main process, which will only use
  ``_NUM_THREADS``. Joblib exposes a context manager for finer control over
  the number of threads in its workers (see joblib docs linked below).
- Joblib is currently unable to avoid oversubscription in a multi-threading
  context. It can only do so with the ``loky`` backend (which spawns
  processes).

You will find additional details about joblib mitigation of oversubscription
in `joblib documentation `_.
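As a concrete sketch of the scenario described above (synthetic data; note
that on older scikit-learn versions
:class:`~sklearn.ensemble.HistGradientBoostingClassifier` additionally
requires ``from sklearn.experimental import enable_hist_gradient_boosting``)::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import HistGradientBoostingClassifier
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_samples=1000, random_state=0)

    # joblib spawns 8 loky worker processes; with joblib >= 0.14 each
    # worker is in turn told to use at most n_cpus // n_jobs OpenMP
    # threads, avoiding the naive 8 * 8 = 64 thread oversubscription
    search = GridSearchCV(
        HistGradientBoostingClassifier(random_state=0),
        param_grid={"max_leaf_nodes": [15, 31, 63]},
        n_jobs=8,
    )
    search.fit(X, y)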
Configuration switches
-----------------------

Python runtime
..............

:func:`sklearn.set_config` controls the following behaviors:

:assume_finite: used to skip validation, which enables faster computations
    but may lead to segmentation faults if the data contains NaNs.
:working_memory: the optimal size of temporary arrays used by some
    algorithms.

.. _environment_variable:

Environment variables
......................

These environment variables should be set before importing scikit-learn.

:SKLEARN_SITE_JOBLIB: When this environment variable is set to a non zero
    value, scikit-learn uses the site joblib rather than its vendored
    version. Consequently, joblib must be installed for scikit-learn to run.
    Note that using the site joblib is at your own risk: the versions of
    scikit-learn and joblib need to be compatible. Currently, joblib 0.11+ is
    supported. In addition, dumps from joblib.Memory might be incompatible,
    and you might lose some caches and have to redownload some datasets.

    .. deprecated:: 0.21

       As of version 0.21 this parameter has no effect, vendored joblib was
       removed and site joblib is always used.

:SKLEARN_ASSUME_FINITE: Sets the default value for the `assume_finite`
    argument of :func:`sklearn.set_config`.

:SKLEARN_WORKING_MEMORY: Sets the default value for the `working_memory`
    argument of :func:`sklearn.set_config`.

:SKLEARN_SEED: Sets the seed of the global random generator when running the
    tests, for reproducibility.

:SKLEARN_SKIP_NETWORK_TESTS: When this environment variable is set to a non
    zero value, the tests that need network access are skipped; this is also
    the default behavior when the variable is not set. Set it to zero to run
    those tests.

================================================
FILE: doc/computing/scaling_strategies.rst
================================================

.. Places parent toc into the sidebar

:parenttoc: True

.. _scaling_strategies:

Strategies to scale computationally: bigger data
=================================================

For some applications the number of examples, features (or both) and/or the
speed at which they need to be processed are challenging for traditional
approaches. In these cases scikit-learn has a number of options you can
consider to make your system scale.

Scaling with instances using out-of-core learning
--------------------------------------------------

Out-of-core (or "external memory") learning is a technique used to learn from
data that cannot fit in a computer's main memory (RAM).

Here is a sketch of a system designed to achieve this goal:

1. a way to stream instances
2. a way to extract features from instances
3. an incremental algorithm

Streaming instances
....................

Basically, 1. may be a reader that yields instances from files on a hard
drive, a database, from a network stream etc. However, details on how to
achieve this are beyond the scope of this documentation.

Extracting features
...................

\2. could be any relevant way to extract features among the different
:ref:`feature extraction ` methods supported by scikit-learn. However, when
working with data that needs vectorization and where the set of features or
values is not known in advance one should take explicit care. A good example
is text classification where unknown terms are likely to be found during
training. It is possible to use a stateful vectorizer if making multiple
passes over the data is reasonable from an application point of view.
Otherwise, one can turn up the difficulty by using a stateless feature
extractor. Currently the preferred way to do this is to use the so-called
:ref:`hashing trick` as implemented by
:class:`sklearn.feature_extraction.FeatureHasher` for datasets with
categorical variables represented as list of Python dicts or
:class:`sklearn.feature_extraction.text.HashingVectorizer` for text
documents.

Incremental learning
.....................

Finally, for 3. we have a number of options inside scikit-learn. Although not
all algorithms can learn incrementally (i.e. without seeing all the instances
at once), all estimators implementing the ``partial_fit`` API are candidates.
Actually, the ability to learn incrementally from a mini-batch of instances
(sometimes called "online learning") is key to out-of-core learning as it
guarantees that at any given time there will be only a small amount of
instances in the main memory. Choosing a good size for the mini-batch that
balances relevancy and memory footprint could involve some tuning [1]_.
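For instance, a minimal sketch of such an incremental loop with
:class:`sklearn.linear_model.SGDClassifier` (the random in-memory batches
below stand in for a real stream of instances)::

    import numpy as np

    from sklearn.linear_model import SGDClassifier

    rng = np.random.RandomState(42)
    clf = SGDClassifier()

    # all possible target classes must be passed to the first call
    all_classes = np.array([0, 1])

    for i in range(10):  # stand-in for mini-batches streamed from disk
        X_batch = rng.standard_normal((100, 5))
        y_batch = rng.choice(all_classes, size=100)
        clf.partial_fit(X_batch, y_batch, classes=all_classes)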
.. _environment_variable:

Environment variables
......................

These environment variables should be set before importing scikit-learn.

:SKLEARN_SITE_JOBLIB:

    When this environment variable is set to a non-zero value,
    scikit-learn uses the site joblib rather than its vendored version.
    Consequently, joblib must be installed for scikit-learn to run.
    Note that using the site joblib is at your own risk: the versions of
    scikit-learn and joblib need to be compatible. Currently, joblib 0.11+
    is supported. In addition, dumps from joblib.Memory might be
    incompatible, and you might lose some caches and have to redownload
    some datasets.

    .. deprecated:: 0.21

       As of version 0.21 this parameter has no effect, vendored joblib was
       removed and site joblib is always used.

:SKLEARN_ASSUME_FINITE:

    Sets the default value for the `assume_finite` argument of
    :func:`sklearn.set_config`.

:SKLEARN_WORKING_MEMORY:

    Sets the default value for the `working_memory` argument of
    :func:`sklearn.set_config`.

:SKLEARN_SEED:

    Sets the seed of the global random generator when running the tests,
    for reproducibility.

:SKLEARN_SKIP_NETWORK_TESTS:

    When this environment variable is set to a non-zero value, the tests
    that need network access are skipped. The same happens when the
    variable is not set at all; in other words, network tests only run
    when this variable is explicitly set to ``0``.

================================================
FILE: doc/computing/scaling_strategies.rst
================================================

.. Places parent toc into the sidebar

:parenttoc: True

.. _scaling_strategies:

Strategies to scale computationally: bigger data
=================================================

For some applications the number of examples, features (or both) and/or the
speed at which they need to be processed are challenging for traditional
approaches. In these cases scikit-learn has a number of options you can
consider to make your system scale.

Scaling with instances using out-of-core learning
--------------------------------------------------

Out-of-core (or "external memory") learning is a technique used to learn
from data that cannot fit in a computer's main memory (RAM).

Here is a sketch of a system designed to achieve this goal:

1. a way to stream instances
2. a way to extract features from instances
3. an incremental algorithm

Streaming instances
....................

Basically, 1. may be a reader that yields instances from files on a hard
drive, a database, from a network stream, etc. However, details on how to
achieve this are beyond the scope of this documentation.

Extracting features
...................

\2. could be any relevant way to extract features among the different
:ref:`feature extraction <feature_extraction>` methods supported by
scikit-learn. However, when working with data that needs vectorization and
where the set of features or values is not known in advance, one should
take explicit care. A good example is text classification, where unknown
terms are likely to be found during training. It is possible to use a
stateful vectorizer if making multiple passes over the data is reasonable
from an application point of view. Otherwise, one can turn up the
difficulty by using a stateless feature extractor. Currently the preferred
way to do this is to use the so-called
:ref:`hashing trick <feature_hashing>` as implemented by
:class:`sklearn.feature_extraction.FeatureHasher` for datasets with
categorical variables represented as lists of Python dicts, or
:class:`sklearn.feature_extraction.text.HashingVectorizer` for text
documents.

Incremental learning
.....................

Finally, for 3. we have a number of options inside scikit-learn. Although
not all algorithms can learn incrementally (i.e. without seeing all the
instances at once), all estimators implementing the ``partial_fit`` API are
candidates. Actually, the ability to learn incrementally from a mini-batch
of instances (sometimes called "online learning") is key to out-of-core
learning as it guarantees that at any given time there will be only a small
amount of instances in the main memory. Choosing a good size for the
mini-batch that balances relevancy and memory footprint could involve some
tuning [1]_.
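Putting points 2. and 3. together, a minimal sketch of such a pipeline might
look as follows. The toy mini-batch generator stands in for a real reader
streaming instances from disk or the network, and the data is purely
illustrative::

    import numpy as np
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.linear_model import SGDClassifier

    def iter_minibatches():
        # Stand-in for a reader yielding (texts, labels) mini-batches.
        yield ["cheap watches !!!", "meeting at noon"], [1, 0]
        yield ["you won a prize", "lunch tomorrow?"], [1, 0]

    vectorizer = HashingVectorizer()  # stateless: no fit, no vocabulary
    clf = SGDClassifier()
    all_classes = np.array([0, 1])  # must be known before training starts

    for i, (texts, labels) in enumerate(iter_minibatches()):
        X = vectorizer.transform(texts)  # single pass over each batch
        if i == 0:
            # ``classes=`` is required on the first ``partial_fit`` call.
            clf.partial_fit(X, labels, classes=all_classes)
        else:
            clf.partial_fit(X, labels)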
Here is a list of incremental estimators for different tasks:

- Classification
    + :class:`sklearn.naive_bayes.MultinomialNB`
    + :class:`sklearn.naive_bayes.BernoulliNB`
    + :class:`sklearn.linear_model.Perceptron`
    + :class:`sklearn.linear_model.SGDClassifier`
    + :class:`sklearn.linear_model.PassiveAggressiveClassifier`
    + :class:`sklearn.neural_network.MLPClassifier`
- Regression
    + :class:`sklearn.linear_model.SGDRegressor`
    + :class:`sklearn.linear_model.PassiveAggressiveRegressor`
    + :class:`sklearn.neural_network.MLPRegressor`
- Clustering
    + :class:`sklearn.cluster.MiniBatchKMeans`
    + :class:`sklearn.cluster.Birch`
- Decomposition / feature extraction
    + :class:`sklearn.decomposition.MiniBatchDictionaryLearning`
    + :class:`sklearn.decomposition.IncrementalPCA`
    + :class:`sklearn.decomposition.LatentDirichletAllocation`
- Preprocessing
    + :class:`sklearn.preprocessing.StandardScaler`
    + :class:`sklearn.preprocessing.MinMaxScaler`
    + :class:`sklearn.preprocessing.MaxAbsScaler`

For classification, a somewhat important thing to note is that although a
stateless feature extraction routine may be able to cope with new/unseen
attributes, the incremental learner itself may be unable to cope with
new/unseen target classes. In this case you have to pass all the possible
classes to the first ``partial_fit`` call using the ``classes=`` parameter,
as in the sketch above.

Another aspect to consider when choosing a proper algorithm is that not all
of them put the same importance on each example over time. Namely, the
``Perceptron`` is still sensitive to badly labeled examples even after many
examples, whereas the ``SGD*`` and ``PassiveAggressive*`` families are more
robust to this kind of artifact. Conversely, the latter also tend to give
less importance to remarkably different, yet properly labeled, examples when
they come late in the stream, as their learning rate decreases over time.

Examples
..........

Finally, we have a full-fledged example of
:ref:`sphx_glr_auto_examples_applications_plot_out_of_core_classification.py`.
It is aimed at providing a starting point for people wanting to build
out-of-core learning systems and demonstrates most of the notions discussed
above.

Furthermore, it also shows the evolution of the performance of different
algorithms with the number of processed examples.

.. |accuracy_over_time| image:: ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_001.png
    :target: ../auto_examples/applications/plot_out_of_core_classification.html
    :scale: 80

.. centered:: |accuracy_over_time|

Now looking at the computation time of the different parts, we see that the
vectorization is much more expensive than learning itself. From the
different algorithms, ``MultinomialNB`` is the most expensive, but its
overhead can be mitigated by increasing the size of the mini-batches
(exercise: change ``minibatch_size`` to 100 and 10000 in the program and
compare).

.. |computation_time| image:: ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_003.png
    :target: ../auto_examples/applications/plot_out_of_core_classification.html
    :scale: 80

.. centered:: |computation_time|

Notes
......

.. [1] Depending on the algorithm the mini-batch size can influence results
   or not. SGD*, PassiveAggressive*, and discrete NaiveBayes are truly
   online and are not affected by batch size. Conversely, MiniBatchKMeans
   convergence rate is affected by the batch size. Also, its memory
   footprint can vary dramatically with batch size.

================================================
FILE: doc/computing.rst
================================================

.. Places parent toc into the sidebar

:parenttoc: True

============================
Computing with scikit-learn
============================

.. include:: includes/big_toc_css.rst

.. toctree::
    :maxdepth: 2

    computing/scaling_strategies
    computing/computational_performance
    computing/parallelism

================================================
FILE: doc/conf.py
================================================

# -*- coding: utf-8 -*- # # scikit-learn documentation build configuration file, created by # sphinx-quickstart on Fri Jan 8 09:13:42 2010. # # This file is execfile()d with the current directory set to its containing # dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import sys import os import warnings import re from datetime import datetime from packaging.version import parse from pathlib import Path from io import StringIO # If extensions (or modules to document with autodoc) are in another # directory, add these directories to sys.path here. If the directory # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. sys.path.insert(0, os.path.abspath("sphinxext")) from github_link import make_linkcode_resolve import sphinx_gallery import matplotlib as mpl # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.autosummary", "numpydoc", "sphinx.ext.linkcode", "sphinx.ext.doctest", "sphinx.ext.intersphinx", "sphinx.ext.imgconverter", "sphinx_gallery.gen_gallery", "sphinx_issues", "add_toctree_functions", "sphinx-prompt", "sphinxext.opengraph", "doi_role", ] # Support for `plot::` directives in sphinx 3.2 requires matplotlib 3.1.0 or newer if parse(mpl.__version__) >= parse("3.1.0"): extensions.append("matplotlib.sphinxext.plot_directive") # Produce `plot::` directives for examples that contain `import matplotlib` or # `from matplotlib import`. numpydoc_use_plots = True # Options for the `::plot` directive: # https://matplotlib.org/stable/api/sphinxext_plot_directive_api.html plot_formats = ["png"] plot_include_source = True plot_html_show_formats = False plot_html_show_source_link = False # this is needed for some reason... # see https://github.com/numpy/numpydoc/issues/69 numpydoc_class_members_toctree = False # For maths, use mathjax by default and svg if NO_MATHJAX env variable is set # (useful for viewing the doc offline) if os.environ.get("NO_MATHJAX"): extensions.append("sphinx.ext.imgmath") imgmath_image_format = "svg" mathjax_path = "" else: extensions.append("sphinx.ext.mathjax") mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js" autodoc_default_options = {"members": True, "inherited-members": True} # Add any paths that contain templates here, relative to this directory. templates_path = ["templates"] # generate autosummary even if no references autosummary_generate = True # The suffix of source filenames. source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8' # The main toctree document. main_doc = "contents" # General information about the project.
project = "scikit-learn" copyright = f"2007 - {datetime.now().year}, scikit-learn developers (BSD License)" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. import sklearn parsed_version = parse(sklearn.__version__) version = ".".join(parsed_version.base_version.split(".")[:2]) # The full version, including alpha/beta/rc tags. # Removes post from release name if parsed_version.is_postrelease: release = parsed_version.base_version else: release = sklearn.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # today = '' # Else, today_fmt is used as the format for a strftime call. # today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = ["_build", "templates", "includes", "themes"] # The reST default role (used for this markup: `text`) to use for all # documents. default_role = "literal" # If true, '()' will be appended to :func: etc. cross-reference text. add_function_parentheses = False # If true, the current module name will be prepended to all description # unit titles (such as .. function::). # add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. # show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. html_theme = "scikit-learn-modern" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. html_theme_options = {"google_analytics": True, "mathjax_path": mathjax_path} # Add any paths that contain custom themes here, relative to this directory. html_theme_path = ["themes"] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". # html_title = None # A shorter title for the navigation bar. Default is the same as html_title. html_short_title = "scikit-learn" # The name of an image file (relative to this directory) to place at the top # of the sidebar. html_logo = "logos/scikit-learn-logo-small.png" # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. html_favicon = "logos/favicon.ico" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["images"] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. # html_last_updated_fmt = '%b %d, %Y' # Custom sidebar templates, maps document names to template names. 
# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. html_additional_pages = {"index": "index.html"} # If false, no module index is generated. html_domain_indices = False # If false, no index is generated. html_use_index = False # If true, the index is split into individual pages for each letter. # html_split_index = False # If true, links to the reST sources are added to the pages. # html_show_sourcelink = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. # html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). # html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = "scikit-learndoc" # If true, the reST sources are included in the HTML build as _sources/name. html_copy_source = True # Adds variables into templates html_context = {} # finds latest release highlights and places it into HTML context for # index.html release_highlights_dir = Path("..") / "examples" / "release_highlights" # Finds the highlight with the latest version number latest_highlights = sorted(release_highlights_dir.glob("plot_release_highlights_*.py"))[ -1 ] latest_highlights = latest_highlights.with_suffix("").name html_context[ "release_highlights" ] = f"auto_examples/release_highlights/{latest_highlights}" # get version from highlight name assuming highlights have the form # plot_release_highlights_0_22_0 highlight_version = ".".join(latest_highlights.split("_")[-3:-1]) html_context["release_highlights_version"] = highlight_version # redirects dictionary maps from old links to new links redirects = { "documentation": "index", "auto_examples/feature_selection/plot_permutation_test_for_classification": ( "auto_examples/model_selection/plot_permutation_tests_for_classification" ), } html_context["redirects"] = redirects for old_link in redirects: html_additional_pages[old_link] = "redirects.html" # -- Options for LaTeX output ------------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. "preamble": r""" \usepackage{amsmath}\usepackage{amsfonts}\usepackage{bm} \usepackage{morefloats}\usepackage{enumitem} \setlistdepth{10} \let\oldhref\href \renewcommand{\href}[2]{\oldhref{#1}{\hbox{#2}}} """ } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass # [howto/manual]). latex_documents = [ ( "contents", "user_guide.tex", "scikit-learn user guide", "scikit-learn developers", "manual", ), ] # The name of an image file (relative to this directory) to place at the top of # the title page. latex_logo = "logos/scikit-learn-logo.png" # Documents to append as an appendix to all manuals. # latex_appendices = [] # If false, no module index is generated. 
latex_domain_indices = False trim_doctests_flags = True # intersphinx configuration intersphinx_mapping = { "python": ("https://docs.python.org/{.major}".format(sys.version_info), None), "numpy": ("https://numpy.org/doc/stable", None), "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), "matplotlib": ("https://matplotlib.org/", None), "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), "joblib": ("https://joblib.readthedocs.io/en/latest/", None), "seaborn": ("https://seaborn.pydata.org/", None), } v = parse(release) if v.release is None: raise ValueError( "Ill-formed version: {!r}. Version should follow PEP440".format(version) ) if v.is_devrelease: binder_branch = "main" else: major, minor = v.release[:2] binder_branch = "{}.{}.X".format(major, minor) class SubSectionTitleOrder: """Sort example gallery by title of subsection. Assumes README.txt exists for all subsections and uses the subsection with dashes, '---', as the adornment. """ def __init__(self, src_dir): self.src_dir = src_dir self.regex = re.compile(r"^([\w ]+)\n-", re.MULTILINE) def __repr__(self): return "<%s>" % (self.__class__.__name__,) def __call__(self, directory): src_path = os.path.normpath(os.path.join(self.src_dir, directory)) # Forces Release Highlights to the top if os.path.basename(src_path) == "release_highlights": return "0" readme = os.path.join(src_path, "README.txt") try: with open(readme, "r") as f: content = f.read() except FileNotFoundError: return directory title_match = self.regex.search(content) if title_match is not None: return title_match.group(1) return directory sphinx_gallery_conf = { "doc_module": "sklearn", "backreferences_dir": os.path.join("modules", "generated"), "show_memory": False, "reference_url": {"sklearn": None}, "examples_dirs": ["../examples"], "gallery_dirs": ["auto_examples"], "subsection_order": SubSectionTitleOrder("../examples"), "binder": { "org": "scikit-learn", "repo": "scikit-learn", "binderhub_url": "https://mybinder.org", "branch": binder_branch, "dependencies": "./binder/requirements.txt", "use_jupyter_lab": True, }, # avoid generating too many cross links "inspect_global_variables": False, "remove_config_comments": True, } # The following dictionary contains the information used to create the # thumbnails for the front page of the scikit-learn home page. 
# key: first image in set # values: (number of plot in set, height of thumbnail) carousel_thumbs = {"sphx_glr_plot_classifier_comparison_001.png": 600} # enable experimental module so that experimental estimators can be # discovered properly by sphinx from sklearn.experimental import enable_iterative_imputer # noqa from sklearn.experimental import enable_halving_search_cv # noqa def make_carousel_thumbs(app, exception): """produces the final resized carousel images""" if exception is not None: return print("Preparing carousel images") image_dir = os.path.join(app.builder.outdir, "_images") for glr_plot, max_width in carousel_thumbs.items(): image = os.path.join(image_dir, glr_plot) if os.path.exists(image): c_thumb = os.path.join(image_dir, glr_plot[:-4] + "_carousel.png") sphinx_gallery.gen_rst.scale_image(image, c_thumb, max_width, 190) def filter_search_index(app, exception): if exception is not None: return # searchindex only exist when generating html if app.builder.name != "html": return print("Removing methods from search index") searchindex_path = os.path.join(app.builder.outdir, "searchindex.js") with open(searchindex_path, "r") as f: searchindex_text = f.read() searchindex_text = re.sub(r"{__init__.+?}", "{}", searchindex_text) searchindex_text = re.sub(r"{__call__.+?}", "{}", searchindex_text) with open(searchindex_path, "w") as f: f.write(searchindex_text) def generate_min_dependency_table(app): """Generate min dependency table for docs.""" from sklearn._min_dependencies import dependent_packages # get length of header package_header_len = max(len(package) for package in dependent_packages) + 4 version_header_len = len("Minimum Version") + 4 tags_header_len = max(len(tags) for _, tags in dependent_packages.values()) + 4 output = StringIO() output.write( " ".join( ["=" * package_header_len, "=" * version_header_len, "=" * tags_header_len] ) ) output.write("\n") dependency_title = "Dependency" version_title = "Minimum Version" tags_title = "Purpose" output.write( f"{dependency_title:<{package_header_len}} " f"{version_title:<{version_header_len}} " f"{tags_title}\n" ) output.write( " ".join( ["=" * package_header_len, "=" * version_header_len, "=" * tags_header_len] ) ) output.write("\n") for package, (version, tags) in dependent_packages.items(): output.write( f"{package:<{package_header_len}} {version:<{version_header_len}} {tags}\n" ) output.write( " ".join( ["=" * package_header_len, "=" * version_header_len, "=" * tags_header_len] ) ) output.write("\n") output = output.getvalue() with (Path(".") / "min_dependency_table.rst").open("w") as f: f.write(output) def generate_min_dependency_substitutions(app): """Generate min dependency substitutions for docs.""" from sklearn._min_dependencies import dependent_packages output = StringIO() for package, (version, _) in dependent_packages.items(): package = package.capitalize() output.write(f".. 
|{package}MinVersion| replace:: {version}") output.write("\n") output = output.getvalue() with (Path(".") / "min_dependency_substitutions.rst").open("w") as f: f.write(output) # Config for sphinx_issues # we use the issues path for PRs since the issues URL will forward issues_github_path = "scikit-learn/scikit-learn" def setup(app): app.connect("builder-inited", generate_min_dependency_table) app.connect("builder-inited", generate_min_dependency_substitutions) # to hide/show the prompt in code examples: app.connect("build-finished", make_carousel_thumbs) app.connect("build-finished", filter_search_index) # The following is used by sphinx.ext.linkcode to provide links to github linkcode_resolve = make_linkcode_resolve( "sklearn", "https://github.com/scikit-learn/" "scikit-learn/blob/{revision}/" "{package}/{path}#L{lineno}", ) warnings.filterwarnings( "ignore", category=UserWarning, message=( "Matplotlib is currently using agg, which is a" " non-GUI backend, so cannot show the figure." ), ) # maps functions with a class name that is indistinguishable when case is # ignored to another filename autosummary_filename_map = { "sklearn.cluster.dbscan": "dbscan-function", "sklearn.covariance.oas": "oas-function", "sklearn.decomposition.fastica": "fastica-function", } # Config for sphinxext.opengraph ogp_site_url = "https://scikit-learn.org/stable/" ogp_image = "https://scikit-learn.org/stable/_static/scikit-learn-logo-small.png" ogp_use_first_image = True ogp_site_name = "scikit-learn"

================================================
FILE: doc/conftest.py
================================================

import os from os.path import exists from os.path import join from os import environ import warnings from sklearn.utils import IS_PYPY from sklearn.utils._testing import SkipTest from sklearn.utils._testing import check_skip_network from sklearn.utils.fixes import parse_version from sklearn.datasets import get_data_home from sklearn.datasets._base import _pkl_filepath from sklearn.datasets._twenty_newsgroups import CACHE_NAME def setup_labeled_faces(): data_home = get_data_home() if not exists(join(data_home, "lfw_home")): raise SkipTest("Skipping dataset loading doctests") def setup_rcv1(): check_skip_network() # skip the test in rcv1.rst if the dataset is not already loaded rcv1_dir = join(get_data_home(), "RCV1") if not exists(rcv1_dir): raise SkipTest("Download RCV1 dataset to run this test.") def setup_twenty_newsgroups(): cache_path = _pkl_filepath(get_data_home(), CACHE_NAME) if not exists(cache_path): raise SkipTest("Skipping dataset loading doctests") def setup_working_with_text_data(): if IS_PYPY and os.environ.get("CI", None): raise SkipTest("Skipping too slow test with PyPy on CI") check_skip_network() cache_path = _pkl_filepath(get_data_home(), CACHE_NAME) if not exists(cache_path): raise SkipTest("Skipping dataset loading doctests") def setup_loading_other_datasets(): try: import pandas # noqa except ImportError: raise SkipTest("Skipping loading_other_datasets.rst, pandas not installed") # checks SKLEARN_SKIP_NETWORK_TESTS to see if test should run run_network_tests = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0" if not run_network_tests: raise SkipTest( "Skipping loading_other_datasets.rst, tests can be " "enabled by setting SKLEARN_SKIP_NETWORK_TESTS=0" ) def setup_compose(): try: import pandas # noqa except ImportError: raise SkipTest("Skipping compose.rst, pandas not installed") def setup_impute(): try: import pandas # noqa except ImportError: raise SkipTest("Skipping impute.rst, pandas not installed")
def setup_grid_search(): try: import pandas # noqa except ImportError: raise SkipTest("Skipping grid_search.rst, pandas not installed") def setup_preprocessing(): try: import pandas # noqa if parse_version(pandas.__version__) < parse_version("1.1.0"): raise SkipTest("Skipping preprocessing.rst, pandas version < 1.1.0") except ImportError: raise SkipTest("Skipping preprocessing.rst, pandas not installed") def setup_unsupervised_learning(): try: import skimage # noqa except ImportError: raise SkipTest("Skipping unsupervised_learning.rst, scikit-image not installed") # ignore deprecation warnings from scipy.misc.face warnings.filterwarnings( "ignore", "The binary mode of fromstring", DeprecationWarning ) def skip_if_matplotlib_not_installed(fname): try: import matplotlib # noqa except ImportError: basename = os.path.basename(fname) raise SkipTest(f"Skipping doctests for {basename}, matplotlib not installed") def pytest_runtest_setup(item): fname = item.fspath.strpath # normalise filename to use forward slashes on Windows for easier handling # later fname = fname.replace(os.sep, "/") is_index = fname.endswith("datasets/index.rst") if fname.endswith("datasets/labeled_faces.rst") or is_index: setup_labeled_faces() elif fname.endswith("datasets/rcv1.rst") or is_index: setup_rcv1() elif fname.endswith("datasets/twenty_newsgroups.rst") or is_index: setup_twenty_newsgroups() elif ( fname.endswith("tutorial/text_analytics/working_with_text_data.rst") or is_index ): setup_working_with_text_data() elif fname.endswith("modules/compose.rst") or is_index: setup_compose() elif IS_PYPY and fname.endswith("modules/feature_extraction.rst"): raise SkipTest("FeatureHasher is not compatible with PyPy") elif fname.endswith("datasets/loading_other_datasets.rst"): setup_loading_other_datasets() elif fname.endswith("modules/impute.rst"): setup_impute() elif fname.endswith("modules/grid_search.rst"): setup_grid_search() elif fname.endswith("modules/preprocessing.rst"): setup_preprocessing() elif fname.endswith("statistical_inference/unsupervised_learning.rst"): setup_unsupervised_learning() rst_files_requiring_matplotlib = [ "modules/partial_dependence.rst", "modules/tree.rst", "tutorial/statistical_inference/settings.rst", "tutorial/statistical_inference/supervised_learning.rst", ] for each in rst_files_requiring_matplotlib: if fname.endswith(each): skip_if_matplotlib_not_installed(fname) def pytest_configure(config): # Use matplotlib agg backend during the tests including doctests try: import matplotlib matplotlib.use("agg") except ImportError: pass

================================================
FILE: doc/contents.rst
================================================

.. include:: includes/big_toc_css.rst
.. include:: tune_toc.rst

.. Places global toc into the sidebar

:globalsidebartoc: True

=================
Table Of Contents
=================

.. Define an order for the Table of Contents:

.. toctree::
    :maxdepth: 2

    preface
    tutorial/index
    getting_started
    user_guide
    glossary
    auto_examples/index
    modules/classes
    developers/index

================================================
FILE: doc/data_transforms.rst
================================================

.. Places parent toc into the sidebar

:parenttoc: True

.. include:: includes/big_toc_css.rst

.. _data-transforms:

Dataset transformations
-----------------------

scikit-learn provides a library of transformers, which may clean (see
:ref:`preprocessing`), reduce (see :ref:`data_reduction`), expand (see
:ref:`kernel_approximation`) or generate (see :ref:`feature_extraction`)
feature representations.

Like other estimators, these are represented by classes with a ``fit``
method, which learns model parameters (e.g. mean and standard deviation for
normalization) from a training set, and a ``transform`` method which applies
this transformation model to unseen data. ``fit_transform`` may be more
convenient and efficient for modelling and transforming the training data
simultaneously.

Combining such transformers, either in parallel or in series, is covered in
:ref:`combining_estimators`. :ref:`metrics` covers transforming feature
spaces into affinity matrices, while :ref:`preprocessing_targets` considers
transformations of the target space (e.g. categorical labels) for use in
scikit-learn.
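To illustrate this contract, here is a minimal sketch using
:class:`~sklearn.preprocessing.StandardScaler` as a representative
transformer; the data is arbitrary::

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X_train = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])
    X_test = np.array([[1.0, 2.0]])

    scaler = StandardScaler()
    scaler.fit(X_train)                       # learn per-feature mean/std
    X_test_scaled = scaler.transform(X_test)  # apply *training* statistics

    # Equivalent for the training data, and often more efficient:
    X_train_scaled = scaler.fit_transform(X_train)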
.. toctree::
    :maxdepth: 2

    modules/compose
    modules/feature_extraction
    modules/preprocessing
    modules/impute
    modules/unsupervised_reduction
    modules/random_projection
    modules/kernel_approximation
    modules/metrics
    modules/preprocessing_targets

================================================
FILE: doc/datasets/loading_other_datasets.rst
================================================

.. Places parent toc into the sidebar

:parenttoc: True

.. _loading_other_datasets:

Loading other datasets
======================

.. currentmodule:: sklearn.datasets

.. _sample_images:

Sample images
-------------

Scikit-learn also embeds a couple of sample JPEG images published under a
Creative Commons license by their authors. Those images can be useful to
test algorithms and pipelines on 2D data.

.. autosummary::

   load_sample_images
   load_sample_image

.. image:: ../auto_examples/cluster/images/sphx_glr_plot_color_quantization_001.png
   :target: ../auto_examples/cluster/plot_color_quantization.html
   :scale: 30
   :align: right

.. warning::

  The default coding of images is based on the ``uint8`` dtype to spare
  memory. Often machine learning algorithms work best if the input is
  converted to a floating point representation first. Also, if you plan to
  use ``matplotlib.pyplot.imshow``, don't forget to scale to the range
  0 - 1 as done in the following example.

.. topic:: Examples:

    * :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`

.. _libsvm_loader:

Datasets in svmlight / libsvm format
------------------------------------

scikit-learn includes utility functions for loading datasets in the
svmlight / libsvm format. In this format, each line takes the form ``