Repository: vivek3141/ghostbuster Branch: master Commit: f60068d006b7 Files: 46 Total size: 18.5 MB Directory structure: gitextract_1_8dddfr/ ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── classify.py ├── generate.py ├── llama.py ├── model/ │ ├── features.txt │ ├── model │ ├── mu │ ├── sigma │ └── trigram_model.pkl ├── requirements.txt ├── results/ │ ├── best_features_custom.txt │ ├── best_features_essay.txt │ ├── best_features_four.txt │ ├── best_features_no_gpt.txt │ ├── best_features_old.txt │ ├── best_features_one.txt │ ├── best_features_only_ada.txt │ ├── best_features_reuter.txt │ ├── best_features_three.txt │ ├── best_features_two.txt │ ├── best_features_wp.txt │ ├── document_size.npy │ ├── ghostbuster.csv │ ├── other.csv │ ├── perturb.csv │ ├── perturb_char.npy │ ├── perturb_sent.npy │ ├── roberta.csv │ └── training_size.npy ├── roberta/ │ ├── roberta_results.csv │ ├── run_roberta.py │ └── train.py ├── run.py ├── setup.py ├── train.py └── utils/ ├── __init__.py ├── featurize.py ├── generate.py ├── load.py ├── n_gram.py ├── score.py ├── symbolic.py └── write_logprobs.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ *.txt linguist-detectable=false data/* lingusit-detectable=false ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ openai.config anthropic.config bard.config *.wp_source *.wp_target gptzero.ipynb symbolic_data symbolic_data_four input.txt roberta/models results.csv question.csv answer.csv playground.ipynb t_data data/wp/raw data/reuter/raw data/essay/raw roberta/models_old roberta/models_bak roberta/models* symbolic_data* t_data* eve.csv gpt_zero.ipynb detect_gpt.ipynb zero_shot.ipynb ghostbuster_stats.ipynb *.ipynb temp data rebuttal.py ================================================ FILE: LICENSE ================================================ Creative Commons Legal Code Attribution 3.0 Unported CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. License THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. 1. Definitions a. "Adaptation" means a work based upon the Work, or upon the Work and other pre-existing works, such as a translation, adaptation, derivative work, arrangement of music or other alterations of a literary or artistic work, or phonogram or performance and includes cinematographic adaptations or any other form in which the Work may be recast, transformed, or adapted including in any form recognizably derived from the original, except that a work that constitutes a Collection will not be considered an Adaptation for the purpose of this License. For the avoidance of doubt, where the Work is a musical work, performance or phonogram, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered an Adaptation for the purpose of this License. b. "Collection" means a collection of literary or artistic works, such as encyclopedias and anthologies, or performances, phonograms or broadcasts, or other works or subject matter other than works listed in Section 1(f) below, which, by reason of the selection and arrangement of their contents, constitute intellectual creations, in which the Work is included in its entirety in unmodified form along with one or more other contributions, each constituting separate and independent works in themselves, which together are assembled into a collective whole. A work that constitutes a Collection will not be considered an Adaptation (as defined above) for the purposes of this License. c. "Distribute" means to make available to the public the original and copies of the Work or Adaptation, as appropriate, through sale or other transfer of ownership. d. "Licensor" means the individual, individuals, entity or entities that offer(s) the Work under the terms of this License. e. "Original Author" means, in the case of a literary or artistic work, the individual, individuals, entity or entities who created the Work or if no individual or entity can be identified, the publisher; and in addition (i) in the case of a performance the actors, singers, musicians, dancers, and other persons who act, sing, deliver, declaim, play in, interpret or otherwise perform literary or artistic works or expressions of folklore; (ii) in the case of a phonogram the producer being the person or legal entity who first fixes the sounds of a performance or other sounds; and, (iii) in the case of broadcasts, the organization that transmits the broadcast. f. "Work" means the literary and/or artistic work offered under the terms of this License including without limitation any production in the literary, scientific and artistic domain, whatever may be the mode or form of its expression including digital form, such as a book, pamphlet and other writing; a lecture, address, sermon or other work of the same nature; a dramatic or dramatico-musical work; a choreographic work or entertainment in dumb show; a musical composition with or without words; a cinematographic work to which are assimilated works expressed by a process analogous to cinematography; a work of drawing, painting, architecture, sculpture, engraving or lithography; a photographic work to which are assimilated works expressed by a process analogous to photography; a work of applied art; an illustration, map, plan, sketch or three-dimensional work relative to geography, topography, architecture or science; a performance; a broadcast; a phonogram; a compilation of data to the extent it is protected as a copyrightable work; or a work performed by a variety or circus performer to the extent it is not otherwise considered a literary or artistic work. g. "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation. h. "Publicly Perform" means to perform public recitations of the Work and to communicate to the public those public recitations, by any means or process, including by wire or wireless means or public digital performances; to make available to the public Works in such a way that members of the public may access these Works from a place and at a place individually chosen by them; to perform the Work to the public by any means or process and the communication to the public of the performances of the Work, including by public digital performance; to broadcast and rebroadcast the Work by any means including signs, sounds or images. i. "Reproduce" means to make copies of the Work by any means including without limitation by sound or visual recordings and the right of fixation and reproducing fixations of the Work, including storage of a protected performance or phonogram in digital form or other electronic medium. 2. Fair Dealing Rights. Nothing in this License is intended to reduce, limit, or restrict any uses free from copyright or rights arising from limitations or exceptions that are provided for in connection with the copyright protection under copyright law or other applicable laws. 3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below: a. to Reproduce the Work, to incorporate the Work into one or more Collections, and to Reproduce the Work as incorporated in the Collections; b. to create and Reproduce Adaptations provided that any such Adaptation, including any translation in any medium, takes reasonable steps to clearly label, demarcate or otherwise identify that changes were made to the original Work. For example, a translation could be marked "The original work was translated from English to Spanish," or a modification could indicate "The original work has been modified."; c. to Distribute and Publicly Perform the Work including as incorporated in Collections; and, d. to Distribute and Publicly Perform Adaptations. e. For the avoidance of doubt: i. Non-waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme cannot be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; ii. Waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme can be waived, the Licensor waives the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; and, iii. Voluntary License Schemes. The Licensor waives the right to collect royalties, whether individually or, in the event that the Licensor is a member of a collecting society that administers voluntary licensing schemes, via that society, from any exercise by You of the rights granted under this License. The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. Subject to Section 8(f), all rights not expressly granted by Licensor are hereby reserved. 4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions: a. You may Distribute or Publicly Perform the Work only under the terms of this License. You must include a copy of, or the Uniform Resource Identifier (URI) for, this License with every copy of the Work You Distribute or Publicly Perform. You may not offer or impose any terms on the Work that restrict the terms of this License or the ability of the recipient of the Work to exercise the rights granted to that recipient under the terms of the License. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work You Distribute or Publicly Perform. When You Distribute or Publicly Perform the Work, You may not impose any effective technological measures on the Work that restrict the ability of a recipient of the Work from You to exercise the rights granted to that recipient under the terms of the License. This Section 4(a) applies to the Work as incorporated in a Collection, but this does not require the Collection apart from the Work itself to be made subject to the terms of this License. If You create a Collection, upon notice from any Licensor You must, to the extent practicable, remove from the Collection any credit as required by Section 4(b), as requested. If You create an Adaptation, upon notice from any Licensor You must, to the extent practicable, remove from the Adaptation any credit as required by Section 4(b), as requested. b. If You Distribute, or Publicly Perform the Work or any Adaptations or Collections, You must, unless a request has been made pursuant to Section 4(a), keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or if the Original Author and/or Licensor designate another party or parties (e.g., a sponsor institute, publishing entity, journal) for attribution ("Attribution Parties") in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; (ii) the title of the Work if supplied; (iii) to the extent reasonably practicable, the URI, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and (iv) , consistent with Section 3(b), in the case of an Adaptation, a credit identifying the use of the Work in the Adaptation (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). The credit required by this Section 4 (b) may be implemented in any reasonable manner; provided, however, that in the case of a Adaptation or Collection, at a minimum such credit will appear, if a credit for all contributing authors of the Adaptation or Collection appears, then as part of these credits and in a manner at least as prominent as the credits for the other contributing authors. For the avoidance of doubt, You may only use the credit required by this Section for the purpose of attribution in the manner set out above and, by exercising Your rights under this License, You may not implicitly or explicitly assert or imply any connection with, sponsorship or endorsement by the Original Author, Licensor and/or Attribution Parties, as appropriate, of You or Your use of the Work, without the separate, express prior written permission of the Original Author, Licensor and/or Attribution Parties. c. Except as otherwise agreed in writing by the Licensor or as may be otherwise permitted by applicable law, if You Reproduce, Distribute or Publicly Perform the Work either by itself or as part of any Adaptations or Collections, You must not distort, mutilate, modify or take other derogatory action in relation to the Work which would be prejudicial to the Original Author's honor or reputation. Licensor agrees that in those jurisdictions (e.g. Japan), in which any exercise of the right granted in Section 3(b) of this License (the right to make Adaptations) would be deemed to be a distortion, mutilation, modification or other derogatory action prejudicial to the Original Author's honor and reputation, the Licensor will waive or not assert, as appropriate, this Section, to the fullest extent permitted by the applicable national law, to enable You to reasonably exercise Your right under Section 3(b) of this License (right to make Adaptations) but not otherwise. 5. Representations, Warranties and Disclaimer UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 7. Termination a. This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Adaptations or Collections from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License. b. Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above. 8. Miscellaneous a. Each time You Distribute or Publicly Perform the Work or a Collection, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License. b. Each time You Distribute or Publicly Perform an Adaptation, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License. c. If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable. d. No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent. e. This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You. f. The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). These rights and subject matter take effect in the relevant jurisdiction in which the License terms are sought to be enforced according to the corresponding provisions of the implementation of those treaty provisions in the applicable national law. If the standard suite of rights granted under applicable copyright law includes additional rights not granted under this License, such additional rights are deemed to be included in the License; this License is not intended to restrict the license of any rights under applicable law. Creative Commons Notice Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor. Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, Creative Commons does not authorize the use by either party of the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time. For the avoidance of doubt, this trademark restriction does not form part of this License. Creative Commons may be contacted at https://creativecommons.org/. ================================================ FILE: README.md ================================================ # Ghostbuster: Detecting Text Ghostwritten by Large Language Models [paper] [demo] [data] We introduce Ghostbuster, a state-of-the-art system for detecting AI-generated text. Our method works by passing documents through a series of weaker language models, running a structured search over possible combinations of their features, and then training a classifier on the selected features to predict whether documents are AI-generated.

Crucially, Ghostbuster does not require access to token probabilities from the target model, making it useful for detecting text generated by black-box models or unknown model versions. We compare Ghostbuster to a variety of existing detectors, including DetectGPT and GPTZero, as well as a new RoBERTa baseline. Ghostbuster achieves 99.0 F1 when evaluated across domains, which is 5.9 F1 higher than the best preexisting model. It also outperforms all previous approaches in generalization across writing domains (+7.5 F1), prompting strategies (+2.1 F1), and language models (+4.4 F1). ## Datasets In conjunction with our model, we release three new datasets of human- and AI-generated text as detection benchmarks in the domains of student essays, creative writing, and news articles. These can be found under the `data` folder. Each dataset also contains a `logprobs` folder, which consists of files of the form `X-ada.txt` and `X-davinci.txt`, corresponding to token-wise logprobs when running the document `X.txt` under ada and davinci. We note that any usages of our data prior to Nov 14 2023 would likely be using our old dataset. In order to access this, please revert to the commit hash `604d85c`. # Installation Each of our files are pickled with `python3.10`, so we highly reccomend creating a new conda environment as follows: ``` conda create -n ghostbuster python=3.10 conda activate ghostbuster ``` Then, clone the reponsitory: ``` git clone git@github.com:vivek3141/ghostbuster.git cd ghostbuster ``` Lastly, install the dependencies and the package: ``` pip install -r requirements.txt pip install -e . ``` You may also need to open a `python` shell to install the following nltk `brown` model: ```python import nltk nltk.download('brown') ``` # Usage In order to run a standalone text through Ghostbuster, we provide a `classify.py` file with the following usage: ``` python3 classify.py --file INPUT_FILE_HERE --openai_key OPENAI_KEY ``` To run the experiment files, create a file called `openai.config` in the main directory with the following template: ```javascript { "organization": ORGANIZATION, "api_key": API_KEY } ``` Then, you must generate the cached symbolic data file. This consists of a feature vector for every single feature found through our "symbolic search" method. Running these commands will create binary files in the root directory: ``` python train.py --generate_symbolic_data_four python train.py --generate_symbolic_data_eval ``` These commands should take a couple hours to run! Then, you can run any of the experiments listed in the `run.py` file. ## Disclaimer Ghostbuster’s training data, which consists of news, student essay, and creative writing data, is not representative of all writing styles or topics and contains predominantly British and American English text. If you wish to apply Ghostbuster to real-world cases of potential off-limits usage of text generation, such as identifying ChatGPT-written student essays, be wary that incorrect predictions by Ghostbuster are particularly likely in the following cases: No AI-generated text detector is 100% accurate; we strongly discourage incorporation of Ghostbuster into any systems that automatically penalize students or other writers for alleged usage of text generation without human intervention. Privacy: Please be aware that all inputs to Ghostbuster are sent to the OpenAI API, and we also save the inputs for internal testing purposes. Though we will not distribute the data publicly, we cannot guarantee the privacy of any inputs to Ghostbuster. ================================================ FILE: classify.py ================================================ import numpy as np import dill as pickle import tiktoken import openai import argparse from sklearn.linear_model import LogisticRegression from utils.featurize import normalize, t_featurize_logprobs, score_ngram from utils.symbolic import train_trigram, get_words, vec_functions, scalar_functions parser = argparse.ArgumentParser() parser.add_argument("--file", type=str, default="input.txt") parser.add_argument("--openai_key", type=str, default="") args = parser.parse_args() if args.openai_key != "": openai.api_key = args.openai_key file = args.file MAX_TOKENS = 2047 best_features = open("model/features.txt").read().strip().split("\n") # Load davinci tokenizer enc = tiktoken.encoding_for_model("davinci") # Load model model = pickle.load(open("model/model", "rb")) mu = pickle.load(open("model/mu", "rb")) sigma = pickle.load(open("model/sigma", "rb")) # Load data and featurize with open(file) as f: doc = f.read().strip() # Strip data to first MAX_TOKENS tokens tokens = enc.encode(doc)[:MAX_TOKENS] doc = enc.decode(tokens).strip() print(f"Input: {doc}") # Train trigram print("Loading Trigram...") trigram_model = train_trigram() trigram = np.array(score_ngram(doc, trigram_model, enc.encode, n=3, strip_first=False)) unigram = np.array(score_ngram(doc, trigram_model.base, enc.encode, n=1, strip_first=False)) response = openai.Completion.create( model="ada", prompt="<|endoftext|>" + doc, max_tokens=0, echo=True, logprobs=1, ) ada = np.array(list(map(lambda x: np.exp(x), response["choices"][0]["logprobs"]["token_logprobs"][1:]))) response = openai.Completion.create( model="davinci", prompt="<|endoftext|>" + doc, max_tokens=0, echo=True, logprobs=1, ) davinci = np.array(list(map(lambda x: np.exp(x), response["choices"][0]["logprobs"]["token_logprobs"][1:]))) subwords = response["choices"][0]["logprobs"]["tokens"][1:] gpt2_map = {"\n": "Ċ", "\t": "ĉ", " ": "Ġ"} for i in range(len(subwords)): for k, v in gpt2_map.items(): subwords[i] = subwords[i].replace(k, v) t_features = t_featurize_logprobs(davinci, ada, subwords) vector_map = { "davinci-logprobs": davinci, "ada-logprobs": ada, "trigram-logprobs": trigram, "unigram-logprobs": unigram } exp_features = [] for exp in best_features: exp_tokens = get_words(exp) curr = vector_map[exp_tokens[0]] for i in range(1, len(exp_tokens)): if exp_tokens[i] in vec_functions: next_vec = vector_map[exp_tokens[i+1]] curr = vec_functions[exp_tokens[i]](curr, next_vec) elif exp_tokens[i] in scalar_functions: exp_features.append(scalar_functions[exp_tokens[i]](curr)) break data = (np.array(t_features + exp_features) - mu) / sigma preds = model.predict_proba(data.reshape(-1, 1).T)[:, 1] print(f"Prediction: {preds}") ================================================ FILE: generate.py ================================================ import argparse import openai import re import tqdm import os import math import nltk import numpy as np import string import torch from nltk.corpus import wordnet from datasets import load_dataset from nltk.tokenize.treebank import TreebankWordDetokenizer from tenacity import ( retry, stop_after_attempt, wait_random_exponential, ) from transformers import PegasusForConditionalGeneration, PegasusTokenizer from transformers import AutoTokenizer, AutoModelForCausalLM from utils.generate import generate_documents from utils.write_logprobs import write_logprobs, write_llama_logprobs from utils.symbolic import convert_file_to_logprob_file from utils.load import Dataset, get_generate_dataset nltk.download("wordnet") nltk.download("omw-1.4") llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Using device:", device) datasets = [ Dataset("normal", "data/wp/human"), Dataset("normal", "data/wp/gpt"), Dataset("author", "data/reuter/human"), Dataset("author", "data/reuter/gpt"), Dataset("normal", "data/essay/human"), Dataset("normal", "data/essay/gpt"), ] generate_dataset_fn = get_generate_dataset(*datasets) prompt_types = ["gpt", "gpt_prompt1", "gpt_prompt2", "gpt_writing", "gpt_semantic"] html_replacements = [ ("&", "&"), ("<", "<"), (">", ">"), (""", '"'), ("'", "'"), ] perturb_char_names = [ "char_basic", "char_space", "char_cap", "word_adj", "word_syn", ] perturb_char_sizes = [0, 1, 2, 3, 4, 5, 10, 20, 50, 100, 200] perturb_sent_names = ["sent_adj", "sent_paraph", "para_adj", "para_paraph"] perturb_sent_sizes = list(range(11)) def closest_synonym(word): synonyms = wordnet.synsets(word) if not synonyms: return None # Return None if there are no synonyms closest_synset = synonyms[0] # Assume the first synset is the closest for synset in synonyms[1:]: # Update closest_synset if we find a synset with more lemmas (synonyms) if len(synset.lemmas()) > len(closest_synset.lemmas()): closest_synset = synset # Return the name of the lemma from the closest synset # that is not the same as the input word for lemma in closest_synset.lemmas(): if lemma.name() != word: return lemma.name() return None def html_replace(text): for replacement in html_replacements: text = text.replace(replacement[0], replacement[1]) return text @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) def openai_backoff(**kwargs): return openai.ChatCompletion.create(**kwargs) def round_to_100(n): return int(round(n / 100.0)) * 100 def get_wp_prompts(words, prompt): return [ f'Write a story in {words} words to the prompt "{prompt}."', f'You are an author, who is writing a story in response to the prompt "{prompt}." What would you write in {words} words?', f'Hi! I\'m trying to write a {words}-word story on the following prompt: "{prompt}." Could you please draft something for me?', f'Please help me write a short story in response to the prompt "{prompt}."', f'Write a {words}-word story in the style of a beginner writer in response to the prompt "{prompt}."', f'Write a story with very short sentences in {words} words to the prompt "{prompt}."', ] def get_reuter_prompts(words, headline): return [ f'Write a news article in {words} words based on the headline "{headline}."', f'You are a news reporter, who is writing an article with the headline "{headline}." What would you write in {words} words?', f'Hi! I\'m trying to write a {words}-word news article based on the following headline: "{headline}." Could you please draft something for me?', f'Please help me write a New York Times article for the headline "{headline}."', f'Write a {words}-word news article in the style of a New York Times article based on the headline "{headline}."', f'Write a news article with very short sentences in {words} words based on the headline "{headline}."', ] def get_essay_prompts(words, prompts): return [ f'Write an essay in {words} words to the prompt "{prompt}."', f'You are a student, who is writing an essay in response to the prompt "{prompt}." What would you write in {words} words?', f'Hi! I\'m trying to write a {words}-word essay based on the following prompt: "{prompt}." Could you please draft something for me?', f'Please help me write an essay in response to the prompt "{prompt}."', f"Write a {words}-word essay in the style of a high-school student in response to the following prompt: {prompt}.", f'Write an essay with very short sentences in {words} words to the prompt "{prompt}."', ] def generate_logprobs(generate_dataset_fn, llama_7b_model=None, llama_13b_model=None): files = generate_dataset_fn(lambda f: f) for file in tqdm.tqdm(files): if "logprobs" in file: continue base_path = os.path.dirname(file) + "/logprobs" if not os.path.exists(base_path): os.mkdir(base_path) with open(file, "r") as f: doc = f.read().strip() davinci_file = convert_file_to_logprob_file(file, "davinci") if not os.path.exists(davinci_file): write_logprobs(doc, davinci_file, "davinci") ada_file = convert_file_to_logprob_file(file, "ada") if not os.path.exists(ada_file): write_logprobs(doc, ada_file, "ada") llama_7b_file = convert_file_to_logprob_file(file, "llama-7b") if llama_7b_model and not os.path.exists(llama_7b_file): write_llama_logprobs(doc, llama_7b_file, llama_7b_model) llama_13b_file = convert_file_to_logprob_file(file, "llama-13b") if llama_13b_model and not os.path.exists(llama_13b_file): write_llama_logprobs(doc, llama_13b_file, llama_13b_model) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--seed", type=int, default=0) parser.add_argument("--wp_prompts", action="store_true") parser.add_argument("--wp_human", action="store_true") parser.add_argument("--wp_gpt", action="store_true") parser.add_argument("--reuter_human", action="store_true") parser.add_argument("--reuter_gpt", action="store_true") parser.add_argument("--essay_prompts", action="store_true") parser.add_argument("--essay_human", action="store_true") parser.add_argument("--essay_gpt", action="store_true") parser.add_argument("--logprobs", action="store_true") parser.add_argument("--logprob_other", action="store_true") parser.add_argument("--logprob_llama", action="store_true") parser.add_argument("--gen_perturb_char", action="store_true") parser.add_argument("--logprob_perturb_char", action="store_true") parser.add_argument("--gen_perturb_sent", action="store_true") parser.add_argument("--logprob_perturb_sent", action="store_true") args = parser.parse_args() if args.wp_prompts: def format_prompt(p): p = re.sub(r"\[.*\]", "", p) p = re.sub(r"\\n", " ", p) p = re.sub(r"\\t", " ", p) p = re.sub(r"\s+", " ", p) return p.strip() with open("data/wp/raw/train.wp_source", "r") as f: num_lines_read = 0 print("Generating and writing WP prompts...") pbar = tqdm.tqdm(total=1000) for prompt in f: if num_lines_read >= 1000: break input_prompt = format_prompt(prompt) response = openai_backoff( model="gpt-3.5-turbo", messages=[ { "role": "user", "content": f"Remove all the formatting in this prompt:\n\n{input_prompt}", } ], ) reply = response["choices"][0]["message"]["content"].strip() with open(f"data/wp/prompts/{num_lines_read + 1}.txt", "w") as f: f.write(reply) num_lines_read += 1 pbar.update(1) pbar.close() if args.wp_human: print("Formatting Human WP documents...") with open("data/wp/raw/train.wp_target", "r") as f: num_lines_read = 0 pbar = tqdm.tqdm(total=1000) for doc in f: if num_lines_read >= 1000: break doc = doc.strip() tokens = doc.split(" ") replace = [ ["", "\n"], ] for r in replace: tokens = [t.replace(r[0], r[1]) for t in tokens] detokenizer = TreebankWordDetokenizer() formatted_doc = detokenizer.detokenize(tokens) formatted_doc = "\n".join( [i.strip() for i in formatted_doc.split("\n")] ) formatted_doc = formatted_doc.replace("\n\n", "\n") formatted_doc = formatted_doc.replace("\n\n", "\n") formatted_doc = formatted_doc.replace(" .", ".") formatted_doc = formatted_doc.replace(" ’ ", "'") formatted_doc = formatted_doc.replace(" ”", '"') formatted_doc = formatted_doc.replace("“ ", '"') formatted_doc = html_replace(formatted_doc) with open(f"data/wp/human/{num_lines_read + 1}.txt", "w") as f: f.write(formatted_doc) num_lines_read += 1 pbar.update(1) pbar.close() if args.wp_gpt: print("Generating GPT WP documents...") for idx in tqdm.tqdm(range(1, 1001)): with open(f"data/wp/prompts/{idx}.txt", "r") as f: prompt = f.read().strip() with open(f"data/wp/human/{idx}.txt", "r") as f: words = round_to_100(len(f.read().split(" "))) prompts = get_wp_prompts(words, prompt) for type, prompt in zip(prompt_types, prompts): if os.path.exists(f"data/wp/{type}/{idx}.txt"): continue response = openai_backoff( model="gpt-3.5-turbo", messages=[ { "role": "user", "content": prompt, } ], ) reply = response["choices"][0]["message"]["content"].strip() reply = reply.replace("\n\n", "\n") with open(f"data/wp/{type}/{idx}.txt", "w") as f: f.write(reply) if args.reuter_human: reuter_replace = ["--", "202-898-8312", "((", "($1=", "(A$", "Reuters Chicago"] authors = os.listdir("data/reuter/raw/C50train") print("Formatting Human Reuters documents...") for author in tqdm.tqdm(authors): if not os.path.exists(f"data/reuter/human/{author}"): os.makedirs(f"data/reuter/human/{author}") files = [ f"data/reuter/raw/C50train/{author}/{i}" for i in os.listdir(f"data/reuter/raw/C50train/{author}") ] + [ f"data/reuter/raw/C50test/{author}/{i}" for i in os.listdir(f"data/reuter/raw/C50test/{author}") ] for n, file in enumerate(files[:20]): with open(file, "r") as f: doc = f.read().strip() doc = doc.replace("\n\n", "\n") lines = doc.split("\n") if any([i in lines[-1] for i in reuter_replace]): lines = lines[:-1] doc = "\n".join(lines) doc = html_replace(doc) with open(f"data/reuter/human/{author}/{n+1}.txt", "w") as f: f.write(doc.strip()) if args.reuter_gpt: print("Generating GPT Reuters documents...") authors = os.listdir("data/reuter/human") for author in tqdm.tqdm(authors): for idx in range(1, 21): with open(f"data/reuter/human/{author}/{idx}.txt", "r") as f: words = round_to_100(len(f.read().split(" "))) with open(f"data/reuter/gpt/{author}/headlines/{idx}.txt", "r") as f: headline = f.read().strip() prompts = get_reuter_prompts(words, headline) for type, prompt in zip(prompt_types, prompts): if not os.path.exists(f"data/reuter/{type}/{author}"): os.makedirs(f"data/reuter/{type}/{author}") if os.path.exists(f"data/reuter/{type}/{author}/{idx}.txt"): continue response = openai_backoff( model="gpt-3.5-turbo", messages=[ { "role": "user", "content": prompt, } ], ) reply = response["choices"][0]["message"]["content"].strip() reply = reply.replace("\n\n", "\n") lines = reply.split("\n") if any([i in lines[0].lower() for i in ["sure", "certainly"]]): reply = "\n".join(lines[1:]) lines = reply.split("\n") if any([i in lines[0].lower() for i in ["title"]]): reply = "\n".join(lines[1:]) with open(f"data/reuter/{type}/{author}/{idx}.txt", "w") as f: f.write(reply) if args.essay_human or args.essay_gpt: essay_dataset = load_dataset("qwedsacf/ivypanda-essays") if args.essay_human: print("Formatting Human Essay documents...") num_documents, idx = 0, 0 pbar = tqdm.tqdm(total=1000) while num_documents < 1000: essay = essay_dataset["train"][idx] essay = essay["TEXT"].strip() essay = essay[essay.index("\n") + 1 :] idx += 1 if "table of contents" in essay.lower(): continue essay = essay.replace("\n\n", "\n") lines = essay.split("\n") doc = [] for line in lines: if any( [ i in line.lower() for i in [ "references", "reference", "work cited", "works cited", "bibliography", ] ] ): break doc.append(line) doc = "\n".join(doc) with open(f"data/essay/human/{num_documents + 1}.txt", "w") as f: f.write(doc.strip()) num_documents += 1 pbar.update(1) if args.essay_prompts: print("Generating Essay prompts...") for idx in tqdm.tqdm(range(1, 1001)): with open(f"data/essay/human/{idx}.txt", "r") as f: doc = f.read().strip() response = openai_backoff( model="gpt-3.5-turbo", messages=[ { "role": "user", "content": f"Given the following essay, write a prompt for it:\n\n{' '.join(doc.split(' ')[:500])}", } ], ) reply = response["choices"][0]["message"]["content"].strip() reply = reply.replace("Prompt: ", "").strip() with open(f"data/essay/prompts/{idx}.txt", "w") as f: f.write(reply) if args.essay_gpt: print("Generating GPT Essay documents...") for type in prompt_types: if not os.path.exists(f"data/essay/{type}"): os.makedirs(f"data/essay/{type}") for idx in tqdm.tqdm(range(1, 1001)): with open(f"data/essay/prompts/{idx}.txt", "r") as f: prompt = f.read().strip() with open(f"data/essay/human/{idx}.txt", "r") as f: words = round_to_100(len(f.read().split(" "))) prompts = get_essay_prompts(words, prompt) for type, prompt in zip(prompt_types, prompts): if os.path.exists(f"data/essay/{type}/{idx}.txt"): continue response = openai_backoff( model="gpt-3.5-turbo", messages=[ { "role": "user", "content": prompt, } ], ) reply = response["choices"][0]["message"]["content"].strip() reply = reply.replace("\n\n", "\n") lines = reply.split("\n") if any([i in lines[0].lower() for i in ["sure", "certainly"]]): reply = "\n".join(lines[1:]) lines = reply.split("\n") if any([i in lines[0].lower() for i in ["title"]]): reply = "\n".join(lines[1:]) with open(f"data/essay/{type}/{idx}.txt", "w") as f: f.write(reply) if args.logprobs: datasets = [ Dataset("normal", "data/wp/human"), Dataset("normal", "data/wp/gpt"), Dataset("author", "data/reuter/human"), Dataset("author", "data/reuter/gpt"), Dataset("normal", "data/essay/human"), Dataset("normal", "data/essay/gpt"), ] generate_logprobs(get_generate_dataset(*datasets)) if args.logprob_other: other_datasets = [ Dataset("normal", "data/other/ets"), Dataset("normal", "data/other/lang8"), Dataset("normal", "data/other/pelic"), Dataset("normal", "data/other/gptzero/gpt"), Dataset("normal", "data/other/gptzero/human"), Dataset("normal", "data/other/toefl91"), Dataset("normal", "data/other/undetectable"), ] generate_logprobs(get_generate_dataset(*other_datasets)) if args.logprob_llama: print("Loading LLAMA...") # llama_7b = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf").to( # device # ) llama_13b = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-13B-AWQ").to( device ) print("LLAMA Loaded") datasets = [ Dataset("normal", "data/wp/human"), Dataset("normal", "data/wp/gpt"), Dataset("author", "data/reuter/human"), Dataset("author", "data/reuter/gpt"), Dataset("normal", "data/essay/human"), Dataset("normal", "data/essay/gpt"), ] generate_logprobs( get_generate_dataset(*datasets), # llama_7b_model=llama_7b, llama_13b_model=llama_13b, ) if args.gen_perturb_char: def perturb_char_basic(doc, n=1): if len(doc) < 2: return doc for _ in range(n): peturb_type = np.random.choice(["swap", "delete", "insert"]) if peturb_type == "swap": idx = np.random.randint(len(doc) - 1) doc = doc[:idx] + doc[idx + 1] + doc[idx] + doc[idx + 2 :] elif peturb_type == "delete" and len(doc) > 1: idx = np.random.randint(len(doc)) doc = doc[:idx] + doc[idx + 1 :] elif peturb_type == "insert": idx = np.random.randint(len(doc)) doc = ( doc[:idx] + np.random.choice(list(string.ascii_letters)) + doc[idx:] ) return doc def perturb_char_space(doc, n=1): if len(doc) < 2: return doc for _ in range(n): perturb_type = np.random.choice(["insert", "delete"]) if perturb_type == "insert": idx = np.random.randint(len(doc)) doc = doc[:idx] + " " + doc[idx:] elif perturb_type == "delete": space_indices = [ idx for idx, c in enumerate(doc) if c == " " or c == "\n" ] if len(space_indices) > 0: idx = np.random.choice(space_indices) doc = doc[:idx] + doc[idx + 1 :] return doc def perturb_char_cap(doc, n=1): if len(doc) < 2: return doc for _ in range(n): idx = np.random.randint(len(doc)) if doc[idx].isalpha(): if doc[idx].isupper(): doc = doc[:idx] + doc[idx].lower() + doc[idx + 1 :] else: doc = doc[:idx] + doc[idx].upper() + doc[idx + 1 :] return doc def perturb_word_adj(doc, n=1): words = doc.split(" ") if len(words) < 2: return doc for _ in range(n): idx = np.random.randint(len(words) - 1) words[idx], words[idx + 1] = words[idx + 1], words[idx] doc = " ".join(words) return doc def perturb_word_syn(doc, n=1): words = doc.split(" ") if len(words) < 2: return doc for _ in range(n): idx = np.random.randint(len(words)) word = words[idx] synonym = closest_synonym(word) if synonym: words[idx] = synonym doc = " ".join(words) return doc perturb_char_word_fns = { "char_basic": perturb_char_basic, "char_space": perturb_char_space, "char_cap": perturb_char_cap, "word_adj": perturb_word_adj, "word_syn": perturb_word_syn, } if not os.path.exists("data/perturb"): os.makedirs("data/perturb") np.random.seed(args.seed) # Construct the test/train split. Seed of 0 ensures seriality across # all files performing the same split. indices = np.arange(6000) np.random.shuffle(indices) train, test = ( indices[: math.floor(0.8 * len(indices))], indices[math.floor(0.8 * len(indices)) :], ) # [4320 2006 5689 ... 4256 5807 4875] [5378 5980 5395 ... 1653 2607 2732] print("Train/Test Split:", train, test) files = generate_dataset_fn(lambda f: f, verbose=False) indices = np.arange(len(test)) np.random.shuffle(indices) indices = indices[:200] labels = [] for file in files[test][indices]: if "human" in file and "gpt" not in file: labels.append(0) elif "gpt" in file and "human" not in file: labels.append(1) else: raise ValueError("Invalid file name") with open("data/perturb/labels.txt", "w") as f: f.write("\n".join([str(i) for i in labels])) # Generate the perturbed documents num_perturb = [0, 1, 2, 3, 4, 5, 10, 20, 50, 100, 200] for n in tqdm.tqdm(num_perturb): for perturb_type, func in perturb_char_word_fns.items(): if not os.path.exists(f"data/perturb/{perturb_type}/{n}"): os.makedirs(f"data/perturb/{perturb_type}/{n}") for idx, file in enumerate(files[test][indices]): with open(file, "r") as f: doc = f.read().strip() perturb_doc = func(doc, n=n) with open(f"data/perturb/{perturb_type}/{n}/{idx}.txt", "w") as f: f.write(perturb_doc) if args.logprob_perturb_char: perturb_datasets = [ Dataset("normal", f"data/perturb/{perturb_type}/{n}") for perturb_type in perturb_char_names for n in perturb_char_sizes ] generate_logprobs(get_generate_dataset(*perturb_datasets)) if args.gen_perturb_sent: if torch.cuda.is_available(): device = "cuda" print("Using GPU") else: device = "cpu" print("Using CPU") tokenizer = PegasusTokenizer.from_pretrained("tuner007/pegasus_paraphrase") model = PegasusForConditionalGeneration.from_pretrained( "tuner007/pegasus_paraphrase" ).to(device) def paraphrase(text): batch = tokenizer( [text], truncation=True, padding="longest", return_tensors="pt" ).to(device) translated = model.generate(**batch) tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) return tgt_text[0] def perturb_sent_adj(doc, n=1): """ Randomly swap n pairs of adjacent sentences in the document """ doc = nltk.sent_tokenize(doc) if len(doc) < 2: return (" ".join(doc)).strip() for _ in range(n): idx = np.random.randint(len(doc) - 1) doc[idx], doc[idx + 1] = doc[idx + 1], doc[idx] return (" ".join(doc)).strip() def perturb_sent_paraph(doc, n=1): """ Randomly paraphrase n sentences in the document """ doc = nltk.sent_tokenize(doc) if len(doc) < 1: return (" ".join(doc)).strip() for _ in range(n): idx = np.random.randint(len(doc)) doc[idx] = paraphrase(doc[idx]) return (" ".join(doc)).strip() def perturb_para_adj(doc, n=1): """ Randomly swap n pairs of adjacent paragraphs in the document """ doc = doc.split("\n") if len(doc) < 2: return "\n".join(doc) for _ in range(n): idx = np.random.randint(len(doc) - 1) doc[idx], doc[idx + 1] = doc[idx + 1], doc[idx] return "\n".join(doc) def perturb_para_paraph(doc, n=1): """ Randomly paraphrase n paragraphs in the document """ doc = doc.split("\n") if len(doc) < 1: return "\n".join(doc) for _ in range(n): idx = np.random.randint(len(doc)) doc[idx] = paraphrase(doc[idx]) return "\n".join(doc) perturb_sent_fns = { "sent_adj": perturb_sent_adj, "sent_paraph": perturb_sent_paraph, "para_adj": perturb_para_adj, "para_paraph": perturb_para_paraph, } if not os.path.exists("data/perturb"): os.makedirs("data/perturb") np.random.seed(args.seed) # Construct the test/train split. Seed of 0 ensures seriality across # all files performing the same split. indices = np.arange(6000) np.random.shuffle(indices) train, test = ( indices[: math.floor(0.8 * len(indices))], indices[math.floor(0.8 * len(indices)) :], ) # [4320 2006 5689 ... 4256 5807 4875] [5378 5980 5395 ... 1653 2607 2732] print("Train/Test Split:", train, test) files = generate_dataset_fn(lambda f: f, verbose=False) indices = np.arange(len(test)) np.random.shuffle(indices) indices = indices[:200] labels = [] for file in files[test][indices]: if "human" in file and "gpt" not in file: labels.append(0) elif "gpt" in file and "human" not in file: labels.append(1) else: raise ValueError("Invalid file name") with open("data/perturb/labels.txt", "w") as f: f.write("\n".join([str(i) for i in labels])) # Generate the perturbed documents num_perturb = list(range(11)) for n in tqdm.tqdm(num_perturb): for perturb_type, func in perturb_sent_fns.items(): if not os.path.exists(f"data/perturb/{perturb_type}/{n}"): os.makedirs(f"data/perturb/{perturb_type}/{n}") for idx, file in enumerate(files[test][indices]): with open(file, "r") as f: doc = f.read().strip() perturb_doc = func(doc, n=n) with open(f"data/perturb/{perturb_type}/{n}/{idx}.txt", "w") as f: f.write(perturb_doc) if args.logprob_perturb_sent: perturb_datasets = [ Dataset("normal", f"data/perturb/{perturb_type}/{n}") for perturb_type in perturb_sent_names for n in perturb_sent_sizes ] generate_logprobs(get_generate_dataset(*perturb_datasets)) ================================================ FILE: llama.py ================================================ import os import argparse from utils.featurize import convert_file_to_logprob_file, get_logprobs from utils.load import Dataset, get_generate_dataset from utils.n_gram import TrigramBackoff from utils.featurize import select_features, normalize from utils.symbolic import vec_functions, scalar_functions from transformers import AutoTokenizer from collections import defaultdict import math import tqdm import numpy as np import dill as pickle from nltk.util import ngrams from nltk.corpus import brown from nltk.tokenize import word_tokenize from sklearn.linear_model import LogisticRegression from sklearn.metrics import f1_score datasets = [ Dataset("normal", "data/wp/human"), Dataset("normal", "data/wp/gpt"), Dataset("author", "data/reuter/human"), Dataset("author", "data/reuter/gpt"), Dataset("normal", "data/essay/human"), Dataset("normal", "data/essay/gpt"), ] best_features = [ "trigram-logprobs v-add unigram-logprobs v-> llama-logprobs s-var", "trigram-logprobs v-div unigram-logprobs v-div trigram-logprobs s-avg-top-25", "unigram-logprobs v-mul llama-logprobs s-avg", "trigram-logprobs v-mul unigram-logprobs v-div trigram-logprobs s-avg", "trigram-logprobs v-< unigram-logprobs v-mul llama-logprobs s-avg-top-25", "trigram-logprobs v-mul unigram-logprobs v-sub llama-logprobs s-min", "trigram-logprobs v-mul unigram-logprobs s-avg", "trigram-logprobs v-< unigram-logprobs v-sub llama-logprobs s-avg", "trigram-logprobs v-> unigram-logprobs v-add llama-logprobs s-avg", "trigram-logprobs v-div llama-logprobs v-div trigram-logprobs s-min", ] models = ["gpt"] domains = ["wp", "reuter", "essay"] eval_domains = ["claude", "gpt_prompt1", "gpt_prompt2", "gpt_writing", "gpt_semantic"] vectors = ["llama-logprobs", "unigram-logprobs", "trigram-logprobs"] parser = argparse.ArgumentParser() parser.add_argument("--feature_select", action="store_true") parser.add_argument("--classify", action="store_true") args = parser.parse_args() tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") sentences = brown.sents() tokenized_corpus = [] for sentence in tqdm.tqdm(sentences): tokens = tokenizer(" ".join(sentence))["input_ids"] tokenized_corpus += tokens trigram = TrigramBackoff(tokenized_corpus) vec_combinations = defaultdict(list) for vec1 in range(len(vectors)): for vec2 in range(vec1): for func in vec_functions: if func != "v-div": vec_combinations[vectors[vec1]].append(f"{func} {vectors[vec2]}") for vec1 in vectors: for vec2 in vectors: if vec1 != vec2: vec_combinations[vec1].append(f"v-div {vec2}") def get_words(exp): """ Splits up expression into words, to be individually processed """ return exp.split(" ") def backtrack_functions( max_depth=2, ): """ Backtrack all possible features. """ def helper(prev, depth): if depth >= max_depth: return [] all_funcs = [] prev_word = get_words(prev)[-1] for func in scalar_functions: all_funcs.append(f"{prev} {func}") for comb in vec_combinations[prev_word]: all_funcs += helper(f"{prev} {comb}", depth + 1) return all_funcs ret = [] for vec in vectors: ret += helper(vec, 0) return ret def score_ngram(doc, model, tokenizer, n=3): """ Returns vector of ngram probabilities given document, model and tokenizer """ scores = [] tokens = ( tokenizer(doc.strip())[1:] if n == 1 else (n - 2) * [2] + tokenizer(doc.strip()) ) for i in ngrams(tokens, n): scores.append(model.n_gram_probability(i)) return np.array(scores) def get_all_logprobs( generate_dataset, preprocess=lambda x: x.strip(), verbose=True, trigram=None, tokenizer=None, num_tokens=2047, ): llama_logprobs = {} trigram_logprobs, unigram_logprobs = {}, {} if verbose: print("Loading logprobs into memory") file_names = generate_dataset(lambda file: file, verbose=False) to_iter = tqdm.tqdm(file_names) if verbose else file_names for file in to_iter: if "logprobs" in file: continue with open(file, "r") as f: doc = preprocess(f.read()) llama_logprobs[file] = get_logprobs( convert_file_to_logprob_file(file, "llama-7b") )[:num_tokens] trigram_logprobs[file] = score_ngram(doc, trigram, tokenizer, n=3)[:num_tokens] unigram_logprobs[file] = score_ngram(doc, trigram.base, tokenizer, n=1)[ :num_tokens ] return llama_logprobs, trigram_logprobs, unigram_logprobs all_funcs = backtrack_functions(max_depth=3) np.random.seed(0) # Construct the test/train split. Seed of 0 ensures seriality across # all files performing the same split. indices = np.arange(6000) np.random.shuffle(indices) train, test = ( indices[: math.floor(0.8 * len(indices))], indices[math.floor(0.8 * len(indices)) :], ) # [4320 2006 5689 ... 4256 5807 4875] [5378 5980 5395 ... 1653 2607 2732] print("Train/Test Split:", train, test) generate_dataset_fn = get_generate_dataset(*datasets) labels = generate_dataset_fn( lambda file: 1 if any([m in file for m in ["gpt", "claude"]]) else 0 ) # Construct all indices def get_indices(filter_fn): where = np.where(generate_dataset_fn(filter_fn))[0] curr_train = [i for i in train if i in where] curr_test = [i for i in test if i in where] return curr_train, curr_test indices_dict = {} for model in models + ["human"]: train_indices, test_indices = get_indices( lambda file: 1 if model in file else 0, ) indices_dict[f"{model}_train"] = train_indices indices_dict[f"{model}_test"] = test_indices for model in models + ["human"]: for domain in domains: train_key = f"{model}_{domain}_train" test_key = f"{model}_{domain}_test" train_indices, test_indices = get_indices( lambda file: 1 if domain in file and model in file else 0, ) indices_dict[train_key] = train_indices indices_dict[test_key] = test_indices if args.feature_select: ( llama_logprobs, trigram_logprobs, unigram_logprobs, ) = get_all_logprobs( generate_dataset_fn, verbose=True, tokenizer=lambda x: tokenizer(x)["input_ids"], trigram=trigram, ) vector_map = { "llama-logprobs": lambda file: llama_logprobs[file], "trigram-logprobs": lambda file: trigram_logprobs[file], "unigram-logprobs": lambda file: unigram_logprobs[file], } def calc_features(file, exp): exp_tokens = get_words(exp) curr = vector_map[exp_tokens[0]](file) for i in range(1, len(exp_tokens)): if exp_tokens[i] in vec_functions: next_vec = vector_map[exp_tokens[i + 1]](file) curr = vec_functions[exp_tokens[i]](curr, next_vec) elif exp_tokens[i] in scalar_functions: return scalar_functions[exp_tokens[i]](curr) print("Preparing exp_to_data") exp_to_data = {} for exp in tqdm.tqdm(all_funcs): exp_to_data[exp] = generate_dataset_fn( lambda file: calc_features(file, exp) ).reshape(-1, 1) select_features(exp_to_data, labels, verbose=True, to_normalize=True, indices=train) if args.classify: ( llama_logprobs, trigram_logprobs, unigram_logprobs, ) = get_all_logprobs( generate_dataset_fn, verbose=True, tokenizer=lambda x: tokenizer(x)["input_ids"], trigram=trigram, ) vector_map = { "llama-logprobs": lambda file: llama_logprobs[file], "trigram-logprobs": lambda file: trigram_logprobs[file], "unigram-logprobs": lambda file: unigram_logprobs[file], } def get_exp_featurize(best_features, vector_map): def calc_features(file, exp): exp_tokens = get_words(exp) curr = vector_map[exp_tokens[0]](file) for i in range(1, len(exp_tokens)): if exp_tokens[i] in vec_functions: next_vec = vector_map[exp_tokens[i + 1]](file) curr = vec_functions[exp_tokens[i]](curr, next_vec) elif exp_tokens[i] in scalar_functions: return scalar_functions[exp_tokens[i]](curr) def exp_featurize(file): return np.array([calc_features(file, exp) for exp in best_features]) return exp_featurize data = generate_dataset_fn(get_exp_featurize(best_features, vector_map)) data = normalize(data) def train_llama(data, train, test): model = LogisticRegression() model.fit(data[train], labels[train]) return f1_score(labels[test], model.predict(data[test])) print( f"In-Domain: {train_llama(data, indices_dict['gpt_train'] + indices_dict['human_train'], indices_dict['gpt_test'] + indices_dict['human_test'])}" ) for test_domain in domains: train_indices = [] for train_domain in domains: if train_domain == test_domain: continue train_indices += ( indices_dict[f"gpt_{train_domain}_train"] + indices_dict[f"human_{train_domain}_train"] ) print( f"Out-Domain ({test_domain}): {train_llama(data, train_indices, indices_dict[f'gpt_{test_domain}_test'] + indices_dict[f'human_{test_domain}_test'])}" ) ================================================ FILE: model/features.txt ================================================ unigram-logprobs v-> davinci-logprobs s-var unigram-logprobs v-sub davinci-logprobs v-div unigram-logprobs s-avg-top-25 trigram-logprobs v-> ada-logprobs v-sub davinci-logprobs s-var unigram-logprobs v-> trigram-logprobs v-< davinci-logprobs s-avg unigram-logprobs v-< trigram-logprobs v-mul ada-logprobs s-avg unigram-logprobs v-< trigram-logprobs v-add ada-logprobs s-max trigram-logprobs v-< ada-logprobs v-> davinci-logprobs s-var trigram-logprobs v-> ada-logprobs v-> davinci-logprobs s-avg-top-25 ada-logprobs v-sub davinci-logprobs s-l2 davinci-logprobs s-var trigram-logprobs v-< davinci-logprobs v-div unigram-logprobs s-l2 trigram-logprobs v-> ada-logprobs v-mul davinci-logprobs s-avg ================================================ FILE: model/trigram_model.pkl ================================================ [File too large to display: 18.4 MB] ================================================ FILE: requirements.txt ================================================ tqdm scikit-learn numpy tenacity openai==0.28.1 torch tiktoken flask tabulate dill nltk datasets==2.16.1 transformers==4.36.2 matplotlib ================================================ FILE: results/best_features_custom.txt ================================================ trigram-logprobs v-< davinci-logprobs v-div unigram-logprobs s-l2 trigram-logprobs v-add ada-logprobs v-< davinci-logprobs s-avg-top-25 trigram-logprobs v-sub ada-logprobs v-> davinci-logprobs s-var trigram-logprobs v-div ada-logprobs s-avg trigram-logprobs v-div ada-logprobs v-div davinci-logprobs s-avg trigram-logprobs v-div unigram-logprobs v-> ada-logprobs s-l2 unigram-logprobs v-mul ada-logprobs v-sub davinci-logprobs s-len unigram-logprobs v-< ada-logprobs v-> davinci-logprobs s-avg-top-25 unigram-logprobs v-< ada-logprobs v-div davinci-logprobs s-min unigram-logprobs v-add trigram-logprobs v-add davinci-logprobs s-min ================================================ FILE: results/best_features_essay.txt ================================================ unigram-logprobs v-div ada-logprobs v-> davinci-logprobs s-avg davinci-logprobs s-avg ada-logprobs v-> davinci-logprobs v-div trigram-logprobs s-avg trigram-logprobs v-> davinci-logprobs s-var ================================================ FILE: results/best_features_four.txt ================================================ unigram-logprobs v-add trigram-logprobs v-> ada-logprobs v-sub davinci-logprobs s-var ada-logprobs v-sub davinci-logprobs v-div ada-logprobs v-< davinci-logprobs s-var unigram-logprobs v-> trigram-logprobs v-sub ada-logprobs v-add davinci-logprobs s-avg-top-25 unigram-logprobs v-> trigram-logprobs v-< ada-logprobs v-add davinci-logprobs s-avg trigram-logprobs v-mul davinci-logprobs v-div unigram-logprobs v-< trigram-logprobs s-var trigram-logprobs v-> davinci-logprobs v-div unigram-logprobs v-div davinci-logprobs s-var unigram-logprobs v-< trigram-logprobs v-div unigram-logprobs v-add ada-logprobs s-var ada-logprobs v-div trigram-logprobs v-sub ada-logprobs v-div davinci-logprobs s-max ================================================ FILE: results/best_features_no_gpt.txt ================================================ unigram-logprobs s-var trigram-logprobs v-div unigram-logprobs v-sub trigram-logprobs s-min trigram-logprobs v-div unigram-logprobs v-div trigram-logprobs s-min unigram-logprobs v-> trigram-logprobs s-avg unigram-logprobs v-< trigram-logprobs v-div unigram-logprobs s-avg unigram-logprobs v-mul trigram-logprobs s-avg-top-25 unigram-logprobs v-mul trigram-logprobs s-l2 trigram-logprobs s-l2 trigram-logprobs v-div unigram-logprobs v-sub trigram-logprobs s-l2 ================================================ FILE: results/best_features_old.txt ================================================ unigram-logprobs v-div ada-logprobs v-> davinci-logprobs s-avg unigram-logprobs v-> ada-logprobs v-sub davinci-logprobs s-var unigram-logprobs s-avg ada-logprobs s-min davinci-logprobs v-div trigram-logprobs v-mul ada-logprobs s-avg unigram-logprobs v-> davinci-logprobs s-min unigram-logprobs v-> ada-logprobs s-var ada-logprobs v-sub davinci-logprobs s-avg unigram-logprobs v-mul ada-logprobs s-avg trigram-logprobs v-< ada-logprobs s-avg unigram-logprobs v-< trigram-logprobs s-var unigram-logprobs v-> ada-logprobs v-> davinci-logprobs s-var unigram-logprobs v-add trigram-logprobs v-> davinci-logprobs s-avg unigram-logprobs v-add ada-logprobs v-sub davinci-logprobs s-avg davinci-logprobs v-div trigram-logprobs v-add davinci-logprobs s-min ================================================ FILE: results/best_features_one.txt ================================================ ada-logprobs s-avg davinci-logprobs s-var davinci-logprobs s-min trigram-logprobs s-var davinci-logprobs s-len davinci-logprobs s-max ================================================ FILE: results/best_features_only_ada.txt ================================================ unigram-logprobs v-add trigram-logprobs v-< ada-logprobs s-var unigram-logprobs v-sub ada-logprobs s-avg-top-25 ada-logprobs s-var unigram-logprobs v-mul ada-logprobs s-avg unigram-logprobs v-> trigram-logprobs v-div ada-logprobs s-avg unigram-logprobs v-< trigram-logprobs v-mul ada-logprobs s-avg unigram-logprobs s-avg trigram-logprobs v-div unigram-logprobs v-div ada-logprobs s-max ================================================ FILE: results/best_features_reuter.txt ================================================ unigram-logprobs v-> ada-logprobs v-sub davinci-logprobs s-var trigram-logprobs v-div ada-logprobs v-> davinci-logprobs s-avg ada-logprobs v-sub davinci-logprobs s-var unigram-logprobs v-> trigram-logprobs v-add ada-logprobs s-avg ada-logprobs v-> davinci-logprobs v-div trigram-logprobs s-avg-top-25 ================================================ FILE: results/best_features_three.txt ================================================ unigram-logprobs v-add trigram-logprobs v-< davinci-logprobs s-avg unigram-logprobs v-> ada-logprobs v-sub davinci-logprobs s-var unigram-logprobs v-sub ada-logprobs v-add davinci-logprobs s-avg-top-25 ada-logprobs v-> davinci-logprobs v-div ada-logprobs s-avg trigram-logprobs v-div ada-logprobs v-< davinci-logprobs s-avg unigram-logprobs v-mul davinci-logprobs s-max unigram-logprobs v-sub ada-logprobs v-mul davinci-logprobs s-avg unigram-logprobs v-mul trigram-logprobs v-sub ada-logprobs s-var trigram-logprobs v-sub davinci-logprobs v-div unigram-logprobs s-max ================================================ FILE: results/best_features_two.txt ================================================ unigram-logprobs v-< davinci-logprobs s-var trigram-logprobs v-> ada-logprobs s-avg ada-logprobs v-sub davinci-logprobs s-min unigram-logprobs v-sub ada-logprobs s-l2 unigram-logprobs v-sub davinci-logprobs s-avg-top-25 unigram-logprobs v-mul davinci-logprobs s-l2 ada-logprobs v-< davinci-logprobs s-avg trigram-logprobs v-mul ada-logprobs s-l2 ================================================ FILE: results/best_features_wp.txt ================================================ unigram-logprobs v-add trigram-logprobs v-< davinci-logprobs s-avg unigram-logprobs v-> trigram-logprobs v-> davinci-logprobs s-avg unigram-logprobs v-sub ada-logprobs v-> davinci-logprobs s-avg-top-25 unigram-logprobs v-mul davinci-logprobs s-var ada-logprobs v-div unigram-logprobs v-add ada-logprobs s-max unigram-logprobs v-sub davinci-logprobs v-div unigram-logprobs s-avg-top-25 davinci-logprobs v-div ada-logprobs v-sub davinci-logprobs s-var ================================================ FILE: results/ghostbuster.csv ================================================ Model Type,Experiment,Accuracy,F1,AUC Ghostbuster (Depth One),In-Domain (wp),0.948052,0.946524,0.979872 Ghostbuster (Depth One),In-Domain (reuter),0.995215,0.995215,0.999496 Ghostbuster (Depth One),In-Domain (essay),0.97733,0.97852,0.985001 Ghostbuster (Depth One),In-Domain,0.936667,0.936772,0.979032 Ghostbuster (Depth One),Out-Domain (wp),0.836364,0.813056,0.926055 Ghostbuster (Depth One),Out-Domain (reuter),0.935407,0.937063,0.983769 Ghostbuster (Depth One),Out-Domain (essay),0.848866,0.87395,0.962745 Ghostbuster (Depth One),Out-Domain (claude),0.841725,0.896527,0.957148 Ghostbuster (Depth One),Out-Domain (gpt_prompt1),0.908762,0.942857,0.975045 Ghostbuster (Depth One),Out-Domain (gpt_prompt2),0.910431,0.943961,0.974575 Ghostbuster (Depth One),Out-Domain (gpt_writing),0.899305,0.936558,0.972838 Ghostbuster (Depth One),Out-Domain (gpt_semantic),0.958275,0.974654,0.985868 Ghostbuster (Depth Two),In-Domain (wp),0.984416,0.983784,0.998109 Ghostbuster (Depth Two),In-Domain (reuter),0.995215,0.995215,0.999725 Ghostbuster (Depth Two),In-Domain (essay),0.997481,0.997625,0.999924 Ghostbuster (Depth Two),In-Domain,0.983333,0.983471,0.997694 Ghostbuster (Depth Two),Out-Domain (wp),0.955844,0.952381,0.995461 Ghostbuster (Depth Two),Out-Domain (reuter),0.95933,0.958838,0.991644 Ghostbuster (Depth Two),Out-Domain (essay),0.921914,0.931264,0.9904 Ghostbuster (Depth Two),Out-Domain (claude),0.85758,0.907044,0.987045 Ghostbuster (Depth Two),Out-Domain (gpt_prompt1),0.991099,0.994656,0.999174 Ghostbuster (Depth Two),Out-Domain (gpt_prompt2),0.991099,0.994656,0.999431 Ghostbuster (Depth Two),Out-Domain (gpt_writing),0.99388,0.996332,0.999353 Ghostbuster (Depth Two),Out-Domain (gpt_semantic),0.994159,0.996499,0.9994 Ghostbuster (Depth Three),In-Domain (wp),0.984416,0.983784,0.999406 Ghostbuster (Depth Three),In-Domain (reuter),0.995215,0.995192,0.999657 Ghostbuster (Depth Three),In-Domain (essay),0.994962,0.995261,0.99972 Ghostbuster (Depth Three),In-Domain,0.99,0.990066,0.998942 Ghostbuster (Depth Three),Out-Domain (wp),0.953247,0.949153,0.996407 Ghostbuster (Depth Three),Out-Domain (reuter),0.978469,0.978622,0.996474 Ghostbuster (Depth Three),Out-Domain (essay),0.974811,0.976744,0.999491 Ghostbuster (Depth Three),Out-Domain (claude),0.878999,0.921973,0.986922 Ghostbuster (Depth Three),Out-Domain (gpt_prompt1),0.989152,0.993468,0.998737 Ghostbuster (Depth Three),Out-Domain (gpt_prompt2),0.992768,0.995655,0.999135 Ghostbuster (Depth Three),Out-Domain (gpt_writing),0.994715,0.996829,0.998888 Ghostbuster (Depth Three),Out-Domain (gpt_semantic),0.995828,0.997498,0.999452 Ghostbuster (Depth Four),In-Domain (wp),0.979221,0.978261,0.997001 Ghostbuster (Depth Four),In-Domain (reuter),0.995215,0.995192,0.999886 Ghostbuster (Depth Four),In-Domain (essay),0.994962,0.995261,0.998803 Ghostbuster (Depth Four),In-Domain,0.9825,0.982688,0.996741 Ghostbuster (Depth Four),Out-Domain (wp),0.961039,0.958217,0.994272 Ghostbuster (Depth Four),Out-Domain (reuter),0.964115,0.9642,0.993681 Ghostbuster (Depth Four),Out-Domain (essay),0.97733,0.979021,0.997377 Ghostbuster (N-Gram Only),In-Domain (wp),0.94026,0.936986,0.985951 Ghostbuster (N-Gram Only),In-Domain (reuter),0.91866,0.917874,0.974085 Ghostbuster (N-Gram Only),In-Domain (essay),0.962217,0.964871,0.996562 Ghostbuster (N-Gram Only),In-Domain,0.88,0.88216,0.94893 Ghostbuster (N-Gram Only),Out-Domain (wp),0.818182,0.785276,0.919328 Ghostbuster (N-Gram Only),Out-Domain (reuter),0.73445,0.700809,0.806735 Ghostbuster (N-Gram Only),Out-Domain (essay),0.65995,0.754991,0.921416 Ghostbuster (N-Gram Only),Out-Domain (claude),0.836161,0.894198,0.932834 Ghostbuster (N-Gram Only),Out-Domain (gpt_prompt1),0.90904,0.943901,0.963673 Ghostbuster (N-Gram Only),Out-Domain (gpt_prompt2),0.901252,0.938804,0.958407 Ghostbuster (N-Gram Only),Out-Domain (gpt_writing),0.901252,0.938804,0.959402 Ghostbuster (N-Gram Only),Out-Domain (gpt_semantic),0.90904,0.943901,0.961151 Ghostbuster (N-Gram and Ada),In-Domain (wp),0.994805,0.994624,0.999568 Ghostbuster (N-Gram and Ada),In-Domain (reuter),0.992823,0.992771,0.999908 Ghostbuster (N-Gram and Ada),In-Domain (essay),0.997481,0.997625,0.99972 Ghostbuster (N-Gram and Ada),In-Domain,0.9875,0.987654,0.998267 Ghostbuster (N-Gram and Ada),Out-Domain (wp),0.914286,0.902655,0.996218 Ghostbuster (N-Gram and Ada),Out-Domain (reuter),0.973684,0.973747,0.995948 Ghostbuster (N-Gram and Ada),Out-Domain (essay),0.906801,0.919037,0.998243 Ghostbuster (N-Gram and Ada),Out-Domain (claude),0.717385,0.796962,0.965743 Ghostbuster (N-Gram and Ada),Out-Domain (gpt_prompt1),0.984145,0.990441,0.997964 Ghostbuster (N-Gram and Ada),Out-Domain (gpt_prompt2),0.983866,0.990272,0.998183 Ghostbuster (N-Gram and Ada),Out-Domain (gpt_writing),0.989986,0.993984,0.998328 Ghostbuster (N-Gram and Ada),Out-Domain (gpt_semantic),0.989708,0.993816,0.998736 "Ghostbuster (Depth Three, No Handcrafted)",In-Domain (wp),0.98961,0.989189,0.998595 "Ghostbuster (Depth Three, No Handcrafted)",In-Domain (reuter),0.990431,0.990431,0.999565 "Ghostbuster (Depth Three, No Handcrafted)",In-Domain (essay),0.994962,0.995261,0.999796 "Ghostbuster (Depth Three, No Handcrafted)",In-Domain,0.989167,0.989247,0.998692 "Ghostbuster (Depth Three, No Handcrafted)",Out-Domain (wp),0.94026,0.934097,0.995218 "Ghostbuster (Depth Three, No Handcrafted)",Out-Domain (reuter),0.978469,0.97852,0.996108 "Ghostbuster (Depth Three, No Handcrafted)",Out-Domain (essay),0.972292,0.974478,0.999618 "Ghostbuster (Depth Three, No Handcrafted)",Out-Domain (claude),0.888734,0.928724,0.987969 "Ghostbuster (Depth Three, No Handcrafted)",Out-Domain (gpt_prompt1),0.987204,0.99229,0.998831 "Ghostbuster (Depth Three, No Handcrafted)",Out-Domain (gpt_prompt2),0.991655,0.994985,0.999216 "Ghostbuster (Depth Three, No Handcrafted)",Out-Domain (gpt_writing),0.99388,0.996327,0.999087 "Ghostbuster (Depth Three, No Handcrafted)",Out-Domain (gpt_semantic),0.994437,0.996662,0.99959 Ghostbuster (No Symbolic),In-Domain (wp),0.797403,0.793651,0.884665 Ghostbuster (No Symbolic),In-Domain (reuter),0.880383,0.878641,0.935784 Ghostbuster (No Symbolic),In-Domain (essay),0.937028,0.940618,0.988592 Ghostbuster (No Symbolic),In-Domain,0.803333,0.805281,0.896296 Ghostbuster (No Symbolic),Out-Domain (wp),0.742857,0.772414,0.869725 Ghostbuster (No Symbolic),Out-Domain (reuter),0.767943,0.758105,0.851377 Ghostbuster (No Symbolic),Out-Domain (essay),0.755668,0.771765,0.863305 Ghostbuster (No Symbolic),Out-Domain (claude),0.807789,0.875428,0.893528 Ghostbuster (No Symbolic),Out-Domain (gpt_prompt1),0.779138,0.854151,0.868459 Ghostbuster (No Symbolic),Out-Domain (gpt_prompt2),0.731015,0.816543,0.843016 Ghostbuster (No Symbolic),Out-Domain (gpt_writing),0.689569,0.782116,0.821094 Ghostbuster (No Symbolic),Out-Domain (gpt_semantic),0.85007,0.905422,0.914689 ================================================ FILE: results/other.csv ================================================ Model Type,Experiment,Accuracy,F1,AUC Ghostbuster,Out-Domain (lang8),0.955,0.0,-1 RoBERTa,Out-Domain (lang8),0.986,0.0,-1 Ghostbuster,Out-Domain (gptzero),0.9,0.9,0.9684 Ghostbuster,Out-Domain (ets),0.999,0.0,-1 Ghostbuster,In-Domain (ets),1.0,0.0,-1 RoBERTa,Out-Domain (ets),0.981,0.0,-1 ================================================ FILE: results/perturb.csv ================================================ Model Type,Experiment,Accuracy,F1,AUC Ghostbuster,"Out-Domain (letter, 0)",0.985,0.983957,0.998095 Ghostbuster,"Out-Domain (letter, 10)",0.905,0.888889,0.989474 Ghostbuster,"Out-Domain (letter, 25)",0.655,0.429752,0.970125 Ghostbuster,"Out-Domain (letter, 50)",0.535,0.041237,0.957594 Ghostbuster,"Out-Domain (letter, 100)",0.525,0.0,0.937343 Ghostbuster,"Out-Domain (letter, 200)",0.525,0.0,0.899348 Ghostbuster,"Out-Domain (word, 0)",0.985,0.983957,0.998095 Ghostbuster,"Out-Domain (word, 10)",0.82,0.766234,0.967519 Ghostbuster,"Out-Domain (word, 25)",0.555,0.118812,0.94807 Ghostbuster,"Out-Domain (word, 50)",0.525,0.0,0.918095 Ghostbuster,"Out-Domain (word, 100)",0.525,0.0,0.922907 Ghostbuster,"Out-Domain (word, 200)",0.525,0.0,0.915088 Ghostbuster,"Out-Domain (sentences, 0)",0.98,0.978723,0.997794 Ghostbuster,"Out-Domain (sentences, 10)",0.985,0.983957,0.994386 Ghostbuster,"Out-Domain (sentences, 25)",0.98,0.978723,0.993083 Ghostbuster,"Out-Domain (sentences, 50)",0.975,0.972973,0.995789 Ghostbuster,"Out-Domain (sentences, 100)",0.965,0.961749,0.994586 Ghostbuster,"Out-Domain (sentences, 200)",0.975,0.973262,0.994787 Ghostbuster,"Out-Domain (paragraphs, 0)",0.98,0.978723,0.998095 Ghostbuster,"Out-Domain (paragraphs, 10)",0.985,0.983957,0.997895 Ghostbuster,"Out-Domain (paragraphs, 25)",0.98,0.978495,0.997594 Ghostbuster,"Out-Domain (paragraphs, 50)",0.975,0.973262,0.997794 Ghostbuster,"Out-Domain (paragraphs, 100)",0.975,0.972973,0.998897 Ghostbuster,"Out-Domain (paragraphs, 200)",0.975,0.973262,0.998897 ================================================ FILE: results/roberta.csv ================================================ Model Type,Experiment,Accuracy,F1,AUC RoBERTa,In-Domain (wp),0.81039,0.820639,0.853947 RoBERTa,In-Domain (reuter),0.933014,0.929648,0.992697 RoBERTa,In-Domain (essay),0.962217,0.964706,0.993863 RoBERTa,In-Domain,0.903333,0.905691,0.957186 RoBERTa,Out-Domain (wp),0.974026,0.972376,0.998055 RoBERTa,Out-Domain (reuter),0.801435,0.832998,0.988233 RoBERTa,Out-Domain (essay),0.531486,0.693069,0.935803 RoBERTa,Out-Domain (claude),0.815021,0.87845,0.908494 RoBERTa,Out-Domain (gpt_prompt1),0.951043,0.97047,0.973395 RoBERTa,Out-Domain (gpt_prompt2),0.956885,0.974085,0.975836 RoBERTa,Out-Domain (gpt_writing),0.974965,0.985114,0.979904 RoBERTa,Out-Domain (gpt_semantic),0.943533,0.965785,0.970273 ================================================ FILE: roberta/roberta_results.csv ================================================ Model Type,Experiment,Accuracy,F1,AUC RoBERTa,In-Domain,0.985833,0.986145,0.99995 RoBERTa,In-Domain (wp),0.963636,0.963731,0.999838 RoBERTa,Out-Domain (wp),0.974026,0.973545,0.999271 RoBERTa,In-Domain (reuter),1.0,1.0,1.0 RoBERTa,Out-Domain (reuter),0.543062,0.15859,0.865456 RoBERTa,In-Domain (essay),0.992443,0.992908,1.0 RoBERTa,Out-Domain (essay),0.561713,0.707071,0.879081 ================================================ FILE: roberta/run_roberta.py ================================================ import argparse import math import numpy as np import torch import tqdm import csv from transformers import ( RobertaTokenizer, RobertaForSequenceClassification, ) from utils.load import get_generate_dataset, Dataset from sklearn.metrics import ( accuracy_score, f1_score, roc_auc_score, ) from torch.utils.data import Dataset as TorchDataset, DataLoader from torch.nn import functional as F if torch.cuda.is_available(): print("Using CUDA...") device = torch.device("cuda") else: print("Using CPU...") device = torch.device("cpu") models = ["gpt", "claude"] domains = ["wp", "reuter", "essay"] roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base") wp_dataset = [ Dataset("normal", "../data/wp/human"), Dataset("normal", "../data/wp/gpt"), ] reuter_dataset = [ Dataset("author", "../data/reuter/human"), Dataset("author", "../data/reuter/gpt"), ] essay_dataset = [ Dataset("normal", "../data/essay/human"), Dataset("normal", "../data/essay/gpt"), ] class RobertaDataset(TorchDataset): def __init__(self, texts, labels): self.texts = texts self.labels = labels def __len__(self): return len(self.labels) def __getitem__(self, idx): encoding = roberta_tokenizer( self.texts[idx], return_tensors="pt", truncation=True, padding="max_length", max_length=512, ) return { "input_ids": encoding["input_ids"].squeeze().to(device), "attention_mask": encoding["attention_mask"].squeeze().to(device), "labels": self.labels[idx], } def get_scores(labels, probabilities, calibrated=False, precision=6): if calibrated: threshold = sorted(probabilities)[len(labels) - sum(labels) - 1] else: threshold = 0.5 assert len(labels) == len(probabilities) if sum(labels) == 0: return ( round(accuracy_score(labels, probabilities > threshold), precision), round(f1_score(labels, probabilities > threshold), precision), -1, ) return ( round(accuracy_score(labels, probabilities > threshold), precision), round(f1_score(labels, probabilities > threshold), precision), round(roc_auc_score(labels, probabilities), precision), ) def train_roberta_model(train_text, train_labels, output_dir, max_epochs=1): roberta_model = RobertaForSequenceClassification.from_pretrained( "roberta-base", num_labels=2 ).to(device) optimizer = torch.optim.SGD(roberta_model.parameters(), lr=0.001) loss_fn = torch.nn.CrossEntropyLoss() print("Fine-tuning RoBERTa...") indices = np.arange(len(train_text)) np.random.shuffle(indices) train, valid = ( indices[: math.floor(0.8 * len(indices))], indices[math.floor(0.8 * len(indices)) :], ) train_labels = np.array(train_labels) train_dataset = RobertaDataset([train_text[i] for i in train], train_labels[train]) val_dataset = RobertaDataset([train_text[i] for i in valid], train_labels[valid]) train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True) val_loader = DataLoader(val_dataset, batch_size=8, shuffle=True) prev_val_loss = float("inf") for epoch in range(max_epochs): roberta_model.train() for batch in tqdm.tqdm(train_loader): optimizer.zero_grad() outputs = roberta_model(**batch) loss = loss_fn(outputs.logits.to(device), batch["labels"].to(device)) loss.backward() optimizer.step() del outputs, loss, batch roberta_model.eval() val_loss = 0 for batch in tqdm.tqdm(val_loader): outputs = roberta_model(**batch) loss = loss_fn(outputs.logits.to(device), batch["labels"].to(device)) val_loss += loss.item() del outputs, loss, batch val_loss /= len(val_loader) print(f"Epoch {epoch + 1} Validation Loss: {val_loss}") if val_loss > prev_val_loss: break prev_val_loss = val_loss roberta_model.save_pretrained(output_dir) def run_roberta_model(model_name, texts, labels): roberta_model = RobertaForSequenceClassification.from_pretrained( f"models/roberta_{model_name}", num_labels=2 ).to(device) roberta_model.eval() probs = [] for text in tqdm.tqdm(texts): encoding = roberta_tokenizer( text, return_tensors="pt", truncation=True, padding="max_length", max_length=512, ) encoding = {k: v.to(device) for k, v in encoding.items()} with torch.no_grad(): outputs = roberta_model(**encoding) probs.append(float(F.softmax(outputs.logits, dim=1)[0][1].item())) del encoding, outputs return get_scores(np.array(labels), np.array(probs), calibrated=False) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--train", action="store_true") parser.add_argument("--run", action="store_true") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--output_file", default="roberta_results.csv") args = parser.parse_args() np.random.seed(args.seed) # Construct the test/train split. Seed of 0 ensures seriality across # all files performing the same split. indices = np.arange(6000) np.random.shuffle(indices) train, test = ( indices[: math.floor(0.8 * len(indices))], indices[math.floor(0.8 * len(indices)) :], ) # [4320 2006 5689 ... 4256 5807 4875] [5378 5980 5395 ... 1653 2607 2732] print("Train/Test Split:", train, test) generate_dataset = get_generate_dataset( *wp_dataset, *reuter_dataset, *essay_dataset ) labels = generate_dataset(lambda f: "gpt" in f) assert len(labels) // 2 == sum(labels) if args.train: def train_roberta_gen( gen_fn, out_dir, indices=None, max_epochs=2, filter_fn=lambda x: True ): train_text, train_labels = [], [] if indices is not None: files = gen_fn(lambda f: f)[indices] else: files = gen_fn(lambda f: f) for file in files: if not filter_fn(file): continue with open(file) as f: text = f.read() train_text.append(text) train_labels.append(int("gpt" in file)) train_roberta_model( train_text, train_labels, out_dir, max_epochs=max_epochs ) gen_fn_all = get_generate_dataset(*wp_dataset, *reuter_dataset, *essay_dataset) gen_fn_wp = get_generate_dataset(*reuter_dataset, *essay_dataset) gen_fn_reuter = get_generate_dataset(*wp_dataset, *essay_dataset) gen_fn_essay = get_generate_dataset(*wp_dataset, *reuter_dataset) train_roberta_gen(gen_fn_all, "models/roberta_gpt", indices=train) train_roberta_gen(gen_fn_wp, "models/roberta_wp") train_roberta_gen(gen_fn_reuter, "models/roberta_reuter") train_roberta_gen(gen_fn_essay, "models/roberta_essay") if args.run: results_table = [ ["Model Type", "Experiment", "Accuracy", "F1", "AUC"], ] def get_data(gen_fn, indices=None, filter_fn=lambda f: True): if indices is not None: files = gen_fn(lambda f: f)[indices] else: files = gen_fn(lambda f: f) texts, labels = [], [] for file in files: if not filter_fn(file): continue with open(file) as f: text = f.read() texts.append(text) labels.append(int("gpt" in file)) return texts, labels gen_fn_all = get_generate_dataset(*wp_dataset, *reuter_dataset, *essay_dataset) gen_fn_wp = get_generate_dataset(*wp_dataset) gen_fn_reuter = get_generate_dataset(*reuter_dataset) gen_fn_essay = get_generate_dataset(*essay_dataset) results_table.append( [ "RoBERTa", "In-Domain", *run_roberta_model("gpt", *get_data(gen_fn_all, test)), ] ) for domain in ["wp", "reuter", "essay"]: results_table.append( [ "RoBERTa", f"In-Domain ({domain})", *run_roberta_model( "gpt" if domain != "reuter" else "only_reuter", *get_data(gen_fn_all, test, lambda x: domain in x), ), ] ) results_table.append( [ "RoBERTa", f"Out-Domain ({domain})", *run_roberta_model( domain, *get_data(gen_fn_all, test, lambda x: domain in x) ), ] ) if len(results_table) > 1: with open(args.output_file, "w") as f: writer = csv.writer(f) writer.writerows(results_table) print(f"Saved results to {args.output_file}") ================================================ FILE: roberta/train.py ================================================ import argparse import math import numpy as np import dill as pickle import torch import tqdm import itertools from transformers import ( RobertaTokenizer, RobertaForSequenceClassification, ) from utils.load import get_generate_dataset, Dataset from torch.utils.data import Dataset as TorchDataset, DataLoader if torch.cuda.is_available(): print("Using CUDA...") device = torch.device("cuda") else: print("Using CPU...") device = torch.device("cpu") models = ["gpt", "claude"] domains = ["wp", "reuter", "essay"] roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base") wp_dataset = [ Dataset("normal", "../data/wp/human"), Dataset("normal", "../data/wp/gpt"), ] reuter_dataset = [ Dataset("author", "../data/reuter/human"), Dataset("author", "../data/reuter/gpt"), ] essay_dataset = [ Dataset("normal", "../data/essay/human"), Dataset("normal", "../data/essay/gpt"), ] class RobertaDataset(TorchDataset): def __init__(self, texts, labels): self.texts = texts self.labels = labels def __len__(self): return len(self.labels) def __getitem__(self, idx): encoding = roberta_tokenizer( self.texts[idx], return_tensors="pt", truncation=True, padding="max_length", max_length=512, ) return { "input_ids": encoding["input_ids"].squeeze().to(device), "attention_mask": encoding["attention_mask"].squeeze().to(device), "labels": self.labels[idx], } def train_roberta_model(train_text, train_labels, output_dir): roberta_model = RobertaForSequenceClassification.from_pretrained( "roberta-large", num_labels=2 ).to(device) optimizer = torch.optim.AdamW(roberta_model.parameters(), lr=1e-5) loss_fn = torch.nn.CrossEntropyLoss() print("Fine-tuning RoBERTa...") indices = np.arange(len(train_text)) np.random.shuffle(indices) train, valid = ( indices[: math.floor(0.8 * len(indices))], indices[math.floor(0.8 * len(indices)) :], ) train_labels = np.array(train_labels) train_dataset = RobertaDataset([train_text[i] for i in train], train_labels[train]) val_dataset = RobertaDataset([train_text[i] for i in valid], train_labels[valid]) train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True) val_loader = DataLoader(val_dataset, batch_size=8, shuffle=True) prev_val_loss = float("inf") for epoch in range(1): roberta_model.train() for batch in tqdm.tqdm(train_loader): optimizer.zero_grad() outputs = roberta_model(**batch) loss = loss_fn(outputs.logits.to(device), batch["labels"].to(device)) loss.backward() optimizer.step() del outputs, loss, batch roberta_model.eval() val_loss = 0 for batch in tqdm.tqdm(val_loader): outputs = roberta_model(**batch) loss = loss_fn(outputs.logits.to(device), batch["labels"].to(device)) val_loss += loss.item() del outputs, loss, batch val_loss /= len(val_loader) print(f"Epoch {epoch + 1} Validation Loss: {val_loss}") if val_loss > prev_val_loss: break prev_val_loss = val_loss roberta_model.save_pretrained(output_dir) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--seed", type=int, default=0) parser.add_argument("--train_all", action="store_true") args = parser.parse_args() np.random.seed(args.seed) datasets = [ *wp_dataset, *reuter_dataset, *essay_dataset, ] generate_dataset_fn = get_generate_dataset(*datasets) files = generate_dataset_fn(lambda x: x) labels = generate_dataset_fn( lambda file: 1 if any([m in file for m in ["gpt", "claude"]]) else 0 ) indices = np.arange(len(labels)) # [4320 2006 5689 ... 4256 5807 4875] [5378 5980 5395 ... 1653 2607 2732] np.random.shuffle(indices) train, test = ( indices[: math.floor(0.8 * len(indices))], indices[math.floor(0.8 * len(indices)) :], ) print("Train/Test Split:", train, test) # Construct all indices def get_indices(filter_fn): where = np.where(generate_dataset_fn(filter_fn))[0] curr_train = [i for i in train if i in where] curr_test = [i for i in test if i in where] return curr_train, curr_test def get_texts(indices): texts = [] for file in files[indices]: with open(file) as f: texts.append(f.read()) return texts indices_dict = {} for model in models + ["human"]: train_indices, test_indices = get_indices( lambda file: 1 if model in file else 0 ) indices_dict[f"{model}_train"] = train_indices indices_dict[f"{model}_test"] = test_indices for model in models + ["human"]: for domain in domains: train_key = f"{model}_{domain}_train" test_key = f"{model}_{domain}_test" train_indices, test_indices = get_indices( lambda file: 1 if domain in file and model in file else 0 ) indices_dict[train_key] = train_indices indices_dict[test_key] = test_indices if args.train_all: train_indices = [] for domain in domains: train_indices += ( indices_dict[f"gpt_{domain}_train"] + indices_dict[f"human_{domain}_train"] ) print("Training on GPT Data") print("# of Training Examples:", len(train_indices)) train_roberta_model( get_texts(train_indices), labels[train_indices], f"models/roberta_gpt", ) for test_domain in domains: train_indices = [] for train_domain in domains: if train_domain == test_domain: continue train_indices += ( indices_dict[f"gpt_{train_domain}_train"] + indices_dict[f"human_{train_domain}_train"] ) print(f"Training on OOD {test_domain} Data") print("# of Training Examples:", len(train_indices)) train_roberta_model( get_texts(train_indices), labels[train_indices], f"models/roberta_{test_domain}", ) ================================================ FILE: run.py ================================================ # Built-In Imports import csv import itertools import math import os from collections import defaultdict # External Imports import argparse import dill as pickle import matplotlib.pyplot as plt import numpy as np import tiktoken import tqdm # Torch imports import torch import torch.nn.functional as F from torch.utils.data import Dataset as TorchDataset from transformers import RobertaForSequenceClassification, RobertaTokenizer # Sklearn imports from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, f1_score, roc_auc_score from sklearn.model_selection import GridSearchCV from sklearn.calibration import CalibratedClassifierCV # Local Imports from utils.featurize import normalize, t_featurize, select_features from utils.symbolic import get_all_logprobs, get_exp_featurize, backtrack_functions from utils.load import Dataset, get_generate_dataset from generate import perturb_char_names, perturb_char_sizes from generate import perturb_sent_names, perturb_sent_sizes models = ["gpt"] domains = ["wp", "reuter", "essay"] eval_domains = ["claude", "gpt_prompt1", "gpt_prompt2", "gpt_writing", "gpt_semantic"] if torch.cuda.is_available(): print("Using CUDA...") device = torch.device("cuda") else: print("Using CPU...") device = torch.device("cpu") best_features_map = {} for file in os.listdir("results"): if file.startswith("best_features"): with open(f"results/{file}") as f: best_features_map[file[:-4]] = f.read().strip().split("\n") print("Loading trigram model...") trigram_model = pickle.load( open("model/trigram_model.pkl", "rb"), pickle.HIGHEST_PROTOCOL ) tokenizer = tiktoken.encoding_for_model("davinci").encode print("Loading features...") exp_to_data = pickle.load(open("symbolic_data_gpt", "rb")) t_data = pickle.load(open("t_data", "rb")) print("Loading eval data...") # exp_to_data_eval = pickle.load(open("symbolic_data_eval", "rb")) # t_data_eval = pickle.load(open("t_data_eval", "rb")) roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base") datasets = [ Dataset("normal", "data/wp/human"), Dataset("normal", "data/wp/gpt"), Dataset("author", "data/reuter/human"), Dataset("author", "data/reuter/gpt"), Dataset("normal", "data/essay/human"), Dataset("normal", "data/essay/gpt"), ] eval_dataset = [ Dataset("normal", "data/wp/claude"), Dataset("author", "data/reuter/claude"), Dataset("normal", "data/essay/claude"), Dataset("normal", "data/wp/gpt_prompt1"), Dataset("author", "data/reuter/gpt_prompt1"), Dataset("normal", "data/essay/gpt_prompt1"), Dataset("normal", "data/wp/gpt_prompt2"), Dataset("author", "data/reuter/gpt_prompt2"), Dataset("normal", "data/essay/gpt_prompt2"), Dataset("normal", "data/wp/gpt_writing"), Dataset("author", "data/reuter/gpt_writing"), Dataset("normal", "data/essay/gpt_writing"), Dataset("normal", "data/wp/gpt_semantic"), Dataset("author", "data/reuter/gpt_semantic"), Dataset("normal", "data/essay/gpt_semantic"), ] other_dataset = [] class RobertaDataset(TorchDataset): def __init__(self, texts, labels): self.texts = texts self.labels = labels def __len__(self): return len(self.labels) def __getitem__(self, idx): encoding = roberta_tokenizer( self.texts[idx], return_tensors="pt", truncation=True, padding="max_length", max_length=512, ) return { "input_ids": encoding["input_ids"].squeeze().to(device), "attention_mask": encoding["attention_mask"].squeeze().to(device), "labels": self.labels[idx], } def get_scores(labels, probabilities, calibrated=False, precision=6): if calibrated: threshold = sorted(probabilities)[len(labels) - sum(labels) - 1] else: threshold = 0.5 assert len(labels) == len(probabilities) if sum(labels) == 0 or sum(labels) == len(labels): return ( round(accuracy_score(labels, probabilities > threshold), precision), round(f1_score(labels, probabilities > threshold), precision), -1, ) return ( round(accuracy_score(labels, probabilities > threshold), precision), round(f1_score(labels, probabilities > threshold), precision), round(roc_auc_score(labels, probabilities), precision), ) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--claude", action="store_true") parser.add_argument("--roberta", action="store_true") parser.add_argument("--perplexity_only", action="store_true") parser.add_argument("--ghostbuster", action="store_true") parser.add_argument("--ghostbuster_depth_one", action="store_true") parser.add_argument("--ghostbuster_depth_two", action="store_true") parser.add_argument("--ghostbuster_depth_three", action="store_true") parser.add_argument("--ghostbuster_depth_four", action="store_true") parser.add_argument("--ghostbuster_random", action="store_true") parser.add_argument("--ghostbuster_no_gpt", action="store_true") parser.add_argument("--ghostbuster_no_handcrafted", action="store_true") parser.add_argument("--ghostbuster_no_symbolic", action="store_true") parser.add_argument("--ghostbuster_only_ada", action="store_true") parser.add_argument("--ghostbuster_custom", action="store_true") parser.add_argument("--ghostbuster_other_eval", action="store_true") parser.add_argument("--ghostbuster_vary_training_data", action="store_true") parser.add_argument("--ghostbuster_vary_document_size", action="store_true") parser.add_argument("--hyperparameter_search", action="store_true") parser.add_argument("--perturb", action="store_true") parser.add_argument("--calibration", action="store_true") parser.add_argument("--toefl", action="store_true") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--output_file", type=str, default="results.csv") args = parser.parse_args() np.random.seed(args.seed) # Construct the test/train split. Seed of 0 ensures seriality across # all files performing the same split. indices = np.arange(6000) np.random.shuffle(indices) train, test = ( indices[: math.floor(0.8 * len(indices))], indices[math.floor(0.8 * len(indices)) :], ) # [4320 2006 5689 ... 4256 5807 4875] [5378 5980 5395 ... 1653 2607 2732] print("Train/Test Split:", train, test) # Results table, outputted to args.output_file. # Example Row: ["Ghostbuster (No GPT)", "WP", "gpt_wp", 0.5, 0.5, 0.5] results_table = [ ["Model Type", "Experiment", "Accuracy", "F1", "AUC"], ] # Construct the generate_dataset_fn. This function takes in a featurize function, # which is a mapping from a file location (str) to a desired feature vector. generate_dataset_fn_gpt = get_generate_dataset(*datasets) generate_dataset_fn_eval = get_generate_dataset(*eval_dataset) # t_data_eval = generate_dataset_fn_eval(t_featurize, verbose=True) # pickle.dump(t_data_eval, open("t_data_eval", "wb"), pickle.HIGHEST_PROTOCOL) generate_dataset_fn = get_generate_dataset(*datasets, *eval_dataset) # t_data = generate_dataset_fn(t_featurize, verbose=True) # pickle.dump(t_data, open("t_data", "wb"), pickle.HIGHEST_PROTOCOL) def get_featurized_data(best_features, gpt_only=False): gpt_data = np.concatenate( [t_data] + [exp_to_data[i] for i in best_features], axis=1 ) if gpt_only: return gpt_data eval_data = np.concatenate( [t_data_eval] + [exp_to_data_eval[i] for i in best_features], axis=1 ) return np.concatenate([gpt_data, eval_data], axis=0) # Construct all indices def get_indices(filter_fn): where = np.where(generate_dataset_fn_gpt(filter_fn))[0] curr_train = [i for i in train if i in where] curr_test = [i for i in test if i in where] return curr_train, curr_test indices_dict = {} for model in models + ["human"]: train_indices, test_indices = get_indices( lambda file: 1 if model in file else 0, ) indices_dict[f"{model}_train"] = train_indices indices_dict[f"{model}_test"] = test_indices for model in models + ["human"]: for domain in domains: train_key = f"{model}_{domain}_train" test_key = f"{model}_{domain}_test" train_indices, test_indices = get_indices( lambda file: 1 if domain in file and model in file else 0, ) indices_dict[train_key] = train_indices indices_dict[test_key] = test_indices for key in eval_domains: where = np.where(generate_dataset_fn(lambda file: 1 if key in file else 0))[0] assert len(where) == 3000 indices_dict[f"{key}_test"] = list(where) files = generate_dataset_fn(lambda x: x) labels = generate_dataset_fn( lambda file: 1 if any([m in file for m in ["gpt", "claude"]]) else 0 ) def get_roberta_predictions(data, train, test, domain): print(f"Loading model roberta/models/roberta_{domain}...") roberta_model = RobertaForSequenceClassification.from_pretrained( f"roberta/models/roberta_{domain}", num_labels=2 ) roberta_model.to(device) test_labels = labels[test] test_predictions = [] roberta_model.eval() with torch.no_grad(): for file in tqdm.tqdm(files[test]): with open(file) as f: text = f.read() inputs = roberta_tokenizer( text, return_tensors="pt", truncation=True, padding="max_length", max_length=512, ) inputs = {k: v.to(device) for k, v in inputs.items()} outputs = roberta_model(**inputs) test_predictions.append( float(F.softmax(outputs.logits, dim=1)[0][1].item()) ) return get_scores(np.array(test_labels), np.array(test_predictions)) def train_ghostbuster(data, train, test, domain): model = LogisticRegression() model.fit(data[train], labels[train]) probs = model.predict_proba(data[test])[:, 1] return get_scores(labels[test], probs) def train_perplexity(data, train, test, domain): features = data[train][:, -1].reshape(-1, 1) threshold = sorted(features)[len(features) - sum(labels[train]) - 1] probs = (data[test][:, -1] > threshold).astype(float) return get_scores(labels[test], probs) def run_experiment(best_features, model_name, train_fn, gpt_only=True): gpt_data = get_featurized_data(best_features, gpt_only=True) _, mu, sigma = normalize(gpt_data, ret_mu_sigma=True) data = normalize( get_featurized_data(best_features, gpt_only=gpt_only), mu=mu, sigma=sigma ) print(f"Running {model_name} Predictions...") train_indices, test_indices = [], [] for domain in domains: train_indices += ( indices_dict[f"gpt_{domain}_train"] + indices_dict[f"human_{domain}_train"] ) test_indices += ( indices_dict[f"gpt_{domain}_test"] + indices_dict[f"human_{domain}_test"] ) results_table.append( [ model_name, f"In-Domain ({domain})", *train_fn( data, indices_dict[f"gpt_{domain}_train"] + indices_dict[f"human_{domain}_train"], indices_dict[f"gpt_{domain}_test"] + indices_dict[f"human_{domain}_test"], "gpt", ), ] ) results_table.append( [ model_name, "In-Domain", *train_fn(data, train_indices, test_indices, "gpt"), ] ) for test_domain in domains: train_indices = [] for train_domain in domains: if train_domain == test_domain: continue train_indices += ( indices_dict[f"gpt_{train_domain}_train"] + indices_dict[f"human_{train_domain}_train"] ) results_table.append( [ model_name, f"Out-Domain ({test_domain})", *train_fn( data, train_indices, indices_dict[f"gpt_{test_domain}_test"] + indices_dict[f"human_{test_domain}_test"], test_domain, ), ] ) if gpt_only: return train_indices, test_indices = [], [] for domain in domains: train_indices += ( indices_dict[f"gpt_{domain}_train"] + indices_dict[f"human_{domain}_train"] ) test_indices += indices_dict[f"human_{domain}_test"] for domain in eval_domains: curr_test_indices = list(indices_dict[f"{domain}_test"]) + test_indices results_table.append( [ model_name, f"Out-Domain ({domain})", *train_fn(data, train_indices, curr_test_indices, "gpt"), ] ) if args.perplexity_only: run_experiment( ["davinci-logprobs s-avg"], "Perplexity-Only", train_perplexity, ) if args.roberta: run_experiment([], "RoBERTa", get_roberta_predictions, gpt_only=True) if args.ghostbuster_depth_one or args.ghostbuster: run_experiment( best_features_map["best_features_one"], "Ghostbuster (Depth One)", train_ghostbuster, ) if args.ghostbuster_depth_two or args.ghostbuster: run_experiment( best_features_map["best_features_two"], "Ghostbuster (Depth Two)", train_ghostbuster, ) if args.ghostbuster_depth_three or args.ghostbuster: run_experiment( best_features_map["best_features_three"], "Ghostbuster (Depth Three)", train_ghostbuster, ) if args.ghostbuster_depth_four or args.ghostbuster: run_experiment( best_features_map["best_features_four"], "Ghostbuster (Depth Four)", train_ghostbuster, gpt_only=True, ) if args.ghostbuster_no_gpt or args.ghostbuster: run_experiment( best_features_map["best_features_no_gpt"], "Ghostbuster (N-Gram Only)", train_ghostbuster, ) if args.ghostbuster_only_ada or args.ghostbuster: run_experiment( best_features_map["best_features_only_ada"], "Ghostbuster (N-Gram and Ada)", train_ghostbuster, ) if args.ghostbuster_random or args.ghostbuster: all_features = backtrack_functions(max_depth=3) random_features = np.random.choice(all_features, 10, replace=False) run_experiment( random_features, "Ghostbuster (Random)", train_ghostbuster, ) if args.ghostbuster_custom: run_experiment( best_features_map["best_features_custom"], "Ghostbuster (Custom)", train_ghostbuster, ) if args.ghostbuster_no_handcrafted or args.ghostbuster: def train_ghostbuster_no_handcrafted(data, train, test, domain): data = data[:, 7:] return train_ghostbuster(data, train, test, domain) run_experiment( best_features_map["best_features_three"], "Ghostbuster (Depth Three, No Handcrafted)", train_ghostbuster_no_handcrafted, ) if args.ghostbuster_no_symbolic or args.ghostbuster: def train_ghostbuster_no_symbolic(data, train, test, domain): data = data[:, :7] return train_ghostbuster(data, train, test, domain) run_experiment( best_features_map["best_features_three"], "Ghostbuster (No Symbolic)", train_ghostbuster_no_symbolic, ) if args.ghostbuster_other_eval: data, mu, sigma = normalize( get_featurized_data( best_features_map["best_features_three"], gpt_only=True ), ret_mu_sigma=True, ) model = LogisticRegression() model.fit( data[indices_dict["gpt_train"] + indices_dict["human_train"]], labels[indices_dict["gpt_train"] + indices_dict["human_train"]], ) # Get roberta results on ets roberta_model = RobertaForSequenceClassification.from_pretrained( f"roberta/models/roberta_gpt", num_labels=2 ) roberta_model.to(device) print( get_scores( labels[indices_dict["gpt_test"] + indices_dict["human_test"]], model.predict_proba( data[indices_dict["gpt_test"] + indices_dict["human_test"]] )[:, 1], ) ) other_datasets = [ Dataset("normal", "data/other/lang8"), Dataset("normal", "data/other/pelic"), Dataset("normal", "data/other/gptzero/gpt"), Dataset("normal", "data/other/gptzero/human"), ] def get_data(generate_dataset_fn, best_features): davinci, ada, trigram, unigram = get_all_logprobs( generate_dataset_fn, trigram=trigram_model, tokenizer=tokenizer, ) vector_map = { "davinci-logprobs": lambda file: davinci[file], "ada-logprobs": lambda file: ada[file], "trigram-logprobs": lambda file: trigram[file], "unigram-logprobs": lambda file: unigram[file], } exp_featurize = get_exp_featurize(best_features, vector_map) exp_data = generate_dataset_fn(exp_featurize) return exp_data def evaluate_on_dataset( model, best_features, curr_labels, generate_dataset_fn, dataset_name, train=False, to_normalize=True, ): data, mu, sigma = normalize( get_featurized_data(best_features, gpt_only=True), ret_mu_sigma=True, ) t_data = generate_dataset_fn(t_featurize, verbose=True) exp_data = get_data(generate_dataset_fn, best_features) if to_normalize: curr_data = normalize( np.concatenate([t_data, exp_data], axis=1), mu=mu, sigma=sigma ) else: curr_data = np.concatenate([t_data, exp_data], axis=1) if train: indices = np.arange(len(curr_data)) np.random.shuffle(indices) train_indices = indices[: math.floor(0.8 * len(indices))] test_indices = indices[math.floor(0.8 * len(indices)) :] curr_train_data = np.concatenate( [ curr_data[train_indices], data[indices_dict["gpt_train"] + indices_dict["human_train"]], ], axis=0, ) curr_train_labels = np.concatenate( [ curr_labels[train_indices], labels[indices_dict["gpt_train"] + indices_dict["human_train"]], ], axis=0, ) model = LogisticRegression() model.fit(curr_train_data, curr_train_labels) probs = model.predict_proba(curr_data[test_indices])[:, 1] results_table.append( [ "Ghostbuster", f"In-Domain ({dataset_name})", *get_scores(curr_labels[test_indices], probs), ] ) else: probs = model.predict_proba(curr_data)[:, 1] results_table.append( [ "Ghostbuster", f"Out-Domain ({dataset_name})", *get_scores(curr_labels, probs), ] ) for dataset in ["lang8"]: gen_fn = get_generate_dataset(Dataset("normal", f"data/other/{dataset}")) curr_labels = gen_fn(lambda _: 0) evaluate_on_dataset( model, best_features_map["best_features_three"], curr_labels, gen_fn, dataset, ) exp_data = get_data(gen_fn, ["davinci-logprobs s-avg"]) model_p = LogisticRegression() model_p.fit( exp_to_data["davinci-logprobs s-avg"][ indices_dict["gpt_train"] + indices_dict["human_train"] ], labels[indices_dict["gpt_train"] + indices_dict["human_train"]], ) probs = model_p.predict_proba(exp_data)[:, 1] results_table.append( [ "Perplexity Only", f"Out-Domain (lang8)", *get_scores(curr_labels, probs), ] ) # Evaluate roberta roberta_test = RobertaDataset( gen_fn(lambda file: open(file).read()), gen_fn(lambda _: 0), ) roberta_test_loader = torch.utils.data.DataLoader( roberta_test, batch_size=1, shuffle=False ) roberta_model.eval() roberta_probs = [] with torch.no_grad(): for batch in tqdm.tqdm(roberta_test_loader): inputs = {k: v.to(device) for k, v in batch.items()} outputs = roberta_model(**inputs) roberta_probs.append( float(F.softmax(outputs.logits, dim=1)[0][1].item()) ) results_table.append( [ "RoBERTa", f"Out-Domain ({dataset})", *get_scores(gen_fn(lambda _: 0), np.array(roberta_probs)), ] ) gpt_zero = get_generate_dataset( Dataset("normal", f"data/other/gptzero/human"), Dataset("normal", f"data/other/gptzero/gpt"), ) curr_labels = np.array([0] * 50 + [1] * 50) evaluate_on_dataset( model, best_features_map["best_features_three"], curr_labels, gpt_zero, "gptzero", ) evaluate_on_dataset( model, best_features_map["best_features_three"], [1] * 100, get_generate_dataset(Dataset("normal", "data/other/undetectable")), "undetectable", ) gen_ets = get_generate_dataset(Dataset("normal", f"data/other/ets")) curr_labels = gen_ets(lambda _: 0) evaluate_on_dataset( model, best_features_map["best_features_three"], curr_labels, gen_ets, "ets", ) evaluate_on_dataset( model, best_features_map["best_features_three"], curr_labels, gen_ets, "ets", train=True, ) exp_data = get_data(gen_ets, ["davinci-logprobs s-avg"]) model = LogisticRegression() model.fit( exp_to_data["davinci-logprobs s-avg"][ indices_dict["gpt_train"] + indices_dict["human_train"] ], labels[indices_dict["gpt_train"] + indices_dict["human_train"]], ) probs = model.predict_proba(exp_data)[:, 1] results_table.append( [ "Perplexity Only", f"Out-Domain (ets)", *get_scores(curr_labels, probs), ] ) roberta_test = RobertaDataset( gen_ets(lambda file: open(file).read()), gen_ets(lambda _: 0), ) roberta_test_loader = torch.utils.data.DataLoader( roberta_test, batch_size=1, shuffle=False ) roberta_model.eval() roberta_probs = [] with torch.no_grad(): for batch in tqdm.tqdm(roberta_test_loader): inputs = {k: v.to(device) for k, v in batch.items()} outputs = roberta_model(**inputs) roberta_probs.append( float(F.softmax(outputs.logits, dim=1)[0][1].item()) ) results_table.append( [ "RoBERTa", f"Out-Domain (ets)", *get_scores(gen_ets(lambda _: 0), np.array(roberta_probs)), ] ) if args.toefl: def get_data(generate_dataset_fn, best_features): davinci, ada, trigram, unigram = get_all_logprobs( generate_dataset_fn, trigram=trigram_model, tokenizer=tokenizer, ) vector_map = { "davinci-logprobs": lambda file: davinci[file], "ada-logprobs": lambda file: ada[file], "trigram-logprobs": lambda file: trigram[file], "unigram-logprobs": lambda file: unigram[file], } exp_featurize = get_exp_featurize(best_features, vector_map) exp_data = generate_dataset_fn(exp_featurize) t_data = generate_dataset_fn(t_featurize, verbose=True) return np.concatenate([t_data, exp_data], axis=1) # Evaluate on data contained in data/other/toefl91 data = get_featurized_data(best_features_map["best_features_three"]) data, mu, sigma = normalize(data, ret_mu_sigma=True) model = LogisticRegression() model.fit( data[indices_dict["gpt_train"] + indices_dict["human_train"]], labels[indices_dict["gpt_train"] + indices_dict["human_train"]], ) print( f"Model F1: {f1_score(labels[indices_dict['gpt_test'] + indices_dict['human_test']], model.predict(data[indices_dict['gpt_test'] + indices_dict['human_test']]))}" ) toefl = get_generate_dataset(Dataset("normal", "data/other/toefl91")) toefl_labels = toefl(lambda _: 0) toefl_data = get_data(toefl, best_features_map["best_features_three"]) toefl_data = normalize(toefl_data, mu=mu, sigma=sigma) results_table.append( [ "Ghostbuster", f"Out-Domain (toefl)", accuracy_score(toefl_labels, model.predict(toefl_data)), ] ) # Do with perplexity only perplxity_data = get_featurized_data(["davinci-logprobs s-avg"], gpt_only=True) perplexity_model = LogisticRegression() perplexity_model.fit( perplxity_data[indices_dict["gpt_train"] + indices_dict["human_train"]], labels[indices_dict["gpt_train"] + indices_dict["human_train"]], ) toefl_data = get_data(toefl, ["davinci-logprobs s-avg"]) results_table.append( [ "Perplexity Only", f"Out-Domain (toefl)", accuracy_score(toefl_labels, perplexity_model.predict(toefl_data)), ] ) # Do RoBERTa roberta_model = RobertaForSequenceClassification.from_pretrained( f"roberta/models/roberta_gpt", num_labels=2 ) roberta_model.to(device) roberta_predictions = [] with torch.no_grad(): for file in tqdm.tqdm(toefl(lambda file: file)): with open(file) as f: text = f.read() inputs = roberta_tokenizer( text, return_tensors="pt", truncation=True, padding="max_length", max_length=512, ) inputs = {k: v.to(device) for k, v in inputs.items()} outputs = roberta_model(**inputs) roberta_predictions.append( float(F.softmax(outputs.logits, dim=1)[0][1].item()) ) results_table.append( [ "RoBERTa", f"Out-Domain (toefl)", accuracy_score(toefl_labels, np.array(roberta_predictions) > 0.5), ] ) results_table.append(["GPT Zero", f"Out-Domain (toefl)", 0.923077]) results_table.append(["DetectGPT", f"Out-Domain (toefl)", 0.6373626373626373]) if args.ghostbuster_vary_training_data: exp_to_data_three = pickle.load(open("symbolic_data_gpt", "rb")) train_indices = indices_dict["gpt_train"] + indices_dict["human_train"] test_indices = indices_dict["gpt_test"] + indices_dict["human_test"] np.random.shuffle(train_indices) claude_test_indices = ( list(indices_dict["claude_test"]) + indices_dict["human_test"] ) scores = [] train_sizes = [int(125 * (2**i)) for i in range(6)] + [len(train_indices)] print(train_sizes) for size in tqdm.tqdm(train_sizes): print(f"Now running size: {size}") curr_train_indices = train_indices[:size] curr_best_features = select_features( exp_to_data_three, labels, verbose=True, to_normalize=True, indices=curr_train_indices, ) data = normalize(get_featurized_data(curr_best_features)) curr_score_vec = [] print(data[curr_train_indices].shape) model = LogisticRegression() model.fit(data[curr_train_indices], labels[curr_train_indices]) curr_score_vec.append( f1_score(labels[test_indices], model.predict(data[test_indices])) ) curr_score_vec.append( f1_score( labels[claude_test_indices], model.predict(data[claude_test_indices]), ) ) for test_domain in domains: domain_train_indices = [] for train_domain in domains: if train_domain == test_domain: continue domain_train_indices += ( indices_dict[f"gpt_{train_domain}_train"] + indices_dict[f"human_{train_domain}_train"] ) domain_train_indices = np.intersect1d( domain_train_indices, curr_train_indices ) domain_test_indices = ( indices_dict[f"gpt_{test_domain}_test"] + indices_dict[f"human_{test_domain}_test"] ) model = LogisticRegression() model.fit(data[domain_train_indices], labels[domain_train_indices]) curr_score_vec.append( f1_score( labels[domain_test_indices], model.predict(data[domain_test_indices]), ) ) scores.append(curr_score_vec) scores = np.array(scores) np.save("results/training_size.npy", scores) plt.plot(train_sizes, scores[:, 0], label="In-Domain") plt.plot(train_sizes, scores[:, 1], label="Out-Domain (Claude)") plt.plot(train_sizes, scores[:, 2], label="Out-Domain (WP)") plt.plot(train_sizes, scores[:, 3], label="Out-Domain (Reuter)") plt.plot(train_sizes, scores[:, 4], label="Out-Domain (Essay)") plt.xlabel("Training Size (# of Documents)") plt.ylabel("F1 Score") plt.legend() plt.savefig("results/training_size.png") if args.ghostbuster_vary_document_size: token_sizes = [10, 25, 50, 100, 250, 500, 1000] scores = [] train_indices = indices_dict["gpt_train"] + indices_dict["human_train"] test_indices = indices_dict["gpt_test"] + indices_dict["human_test"] claude_test_indices = ( list(indices_dict["claude_test"]) + indices_dict["human_test"] ) data = get_featurized_data(best_features_map["best_features_three"]) data, mu, sigma = normalize(data, ret_mu_sigma=True) for num_tokens in tqdm.tqdm(token_sizes): print(f"Now running size: {num_tokens}") curr_t_data = generate_dataset_fn( lambda file: t_featurize(file, num_tokens=num_tokens), verbose=True ) davinci, ada, trigram, unigram = get_all_logprobs( generate_dataset_fn, trigram=trigram_model, tokenizer=tokenizer, num_tokens=num_tokens, ) vector_map = { "davinci-logprobs": lambda file: davinci[file], "ada-logprobs": lambda file: ada[file], "trigram-logprobs": lambda file: trigram[file], "unigram-logprobs": lambda file: unigram[file], } exp_featurize = get_exp_featurize( best_features_map["best_features_three"], vector_map ) curr_exp_data = generate_dataset_fn(exp_featurize) curr_data = np.concatenate([curr_t_data, curr_exp_data], axis=1) curr_data = normalize(curr_data, mu=mu, sigma=sigma) curr_score_vec = [] print(data.shape) model = LogisticRegression() model.fit(data[train_indices], labels[train_indices]) curr_score_vec.append( f1_score(labels[test_indices], model.predict(curr_data[test_indices])) ) curr_score_vec.append( f1_score( labels[claude_test_indices], model.predict(curr_data[claude_test_indices]), ) ) for test_domain in domains: domain_train_indices = [] for train_domain in domains: if train_domain == test_domain: continue domain_train_indices += ( indices_dict[f"gpt_{train_domain}_train"] + indices_dict[f"human_{train_domain}_train"] ) domain_train_indices = np.intersect1d( domain_train_indices, train_indices ) domain_test_indices = ( indices_dict[f"gpt_{test_domain}_test"] + indices_dict[f"human_{test_domain}_test"] ) model = LogisticRegression() model.fit(data[domain_train_indices], labels[domain_train_indices]) curr_score_vec.append( f1_score( labels[domain_test_indices], model.predict(curr_data[domain_test_indices]), ) ) scores.append(curr_score_vec) print(curr_score_vec) scores = np.array(scores) np.save("results/document_size.npy", scores) plt.plot(token_sizes, scores[:, 0], label="In-Domain") plt.plot(token_sizes, scores[:, 1], label="Out-Domain (Claude)") plt.plot(token_sizes, scores[:, 2], label="Out-Domain (WP)") plt.plot(token_sizes, scores[:, 3], label="Out-Domain (Reuter)") plt.plot(token_sizes, scores[:, 4], label="Out-Domain (Essay)") plt.xlabel("Document Size (# of Tokens)") plt.ylabel("F1 Score") plt.legend() plt.savefig("results/document_size.png") if args.hyperparameter_search: data = normalize(get_featurized_data(best_features_map["best_features_three"])) param_grid = { "C": [ 0.01, 0.1, 0.125, 0.25, 0.375, 0.5, 0.675, 0.75, 0.875, 1, 2, 4, 8, 10, ], "penalty": ["l1", "l2", "elasticnet", None], } model = LogisticRegression() grid_search = GridSearchCV( model, param_grid=param_grid, scoring="roc_auc", cv=5, verbose=1 ) grid_search.fit(data[train], labels[train]) print(grid_search.best_params_) model = LogisticRegression( C=grid_search.best_params_["C"], penalty=grid_search.best_params_["penalty"], ) model.fit(data[train], labels[train]) probs = model.predict_proba(data[test])[:, 1] print(get_scores(labels[test], probs)) if args.perturb: data = get_featurized_data(best_features_map["best_features_three"]) data, mu, sigma = normalize(data, ret_mu_sigma=True) model = LogisticRegression() model.fit( data[indices_dict["gpt_train"] + indices_dict["human_train"]], labels[indices_dict["gpt_train"] + indices_dict["human_train"]], ) with open("data/perturb/labels.txt") as f: perturb_labels = np.array([int(i) for i in f.read().split("\n")]) def get_data(generate_dataset_fn, best_features): t_data = generate_dataset_fn(t_featurize, verbose=False) davinci, ada, trigram, unigram = get_all_logprobs( generate_dataset_fn, trigram=trigram_model, tokenizer=tokenizer, verbose=False, ) vector_map = { "davinci-logprobs": lambda file: davinci[file], "ada-logprobs": lambda file: ada[file], "trigram-logprobs": lambda file: trigram[file], "unigram-logprobs": lambda file: unigram[file], } exp_featurize = get_exp_featurize(best_features, vector_map) exp_data = generate_dataset_fn(exp_featurize, verbose=False) return np.concatenate([t_data, exp_data], axis=1) def get_perturb_data(perturb_names, perturb_sizes, save_file): data = defaultdict(list) for perturb_type in tqdm.tqdm(perturb_names): for n in perturb_sizes: gen_fn = get_generate_dataset( Dataset("normal", f"data/perturb/{perturb_type}/{n}") ) curr_labels = gen_fn( lambda file: perturb_labels[ int(os.path.basename(file).split(".")[0]) ] ) curr_data = get_data( gen_fn, best_features_map["best_features_three"] ) curr_data = (curr_data - mu) / sigma probs = model.predict_proba(curr_data)[:, 1] results_table.append( [ "Ghostbuster", f"Out-Domain ({perturb_type}, {n})", *get_scores(curr_labels, probs), ] ) _, f1, _ = get_scores(curr_labels, probs) data[perturb_type].append(f1) np.save(save_file, data) return data perturb_char_data = get_perturb_data( perturb_char_names, perturb_char_sizes, "results/perturb_char.npy" ) for perturb_type in perturb_char_names: plt.plot( perturb_char_sizes, perturb_char_data[perturb_type], label=perturb_type, ) plt.xlabel("Number of Perturbations") plt.ylabel("F1 Score") plt.legend() plt.savefig("results/perturb_char.png") plt.clf() perturb_sent_data = get_perturb_data( perturb_sent_names, perturb_sent_sizes, "results/perturb_sent.npy" ) for perturb_type in perturb_sent_names: plt.plot( perturb_sent_sizes, perturb_sent_data[perturb_type], label=perturb_type, ) plt.xlabel("Number of Perturbations") plt.ylabel("F1 Score") plt.legend() plt.savefig("results/perturb_sent.png") if args.calibration: def calculate_ece(y_true, y_probs, n_bins=10): """Compute ECE""" bin_lowers = np.linspace(0.0, 1.0 - 1.0 / n_bins, n_bins) bin_uppers = np.linspace(1.0 / n_bins, 1.0, n_bins) ece = 0.0 for bin_lower, bin_upper in zip(bin_lowers, bin_uppers): in_bin = np.logical_and(bin_lower <= y_probs, y_probs < bin_upper) prop_in_bin = np.mean(in_bin) if prop_in_bin > 0: y_true_bin = y_true[in_bin] avg_confidence_in_bin = np.mean(y_probs[in_bin]) avg_accuracy_in_bin = np.mean(y_true_bin) ece += ( np.abs(avg_accuracy_in_bin - avg_confidence_in_bin) * prop_in_bin ) return ece def train_ghostbuster_ece(data, train, test, domain): model = LogisticRegression() model.fit(data[train], labels[train]) probs = model.predict_proba(data[test])[:, 1] return [calculate_ece(labels[test], probs)] def train_ghostbuster_calibrated_ece(data, train, test, domain): clf = LogisticRegression() model = CalibratedClassifierCV(clf, method="isotonic", cv=5) model.fit(data[train], labels[train]) probs = model.predict_proba(data[test])[:, 1] return [calculate_ece(labels[test], probs)] run_experiment( best_features_map["best_features_three"], "Ghostbuster (Uncalibrated)", train_ghostbuster_ece, ) run_experiment( best_features_map["best_features_three"], "Ghostbuster (Calibrated)", train_ghostbuster_calibrated_ece, ) if len(results_table) > 1: # Write data to output csv file with open(args.output_file, "w") as f: writer = csv.writer(f) writer.writerows(results_table) print(f"Saved results to {args.output_file}") ================================================ FILE: setup.py ================================================ import os from setuptools import setup, find_packages setup(name="ghostbuster", version="1.0", packages=find_packages()) ================================================ FILE: train.py ================================================ import argparse import math import numpy as np import tiktoken import dill as pickle from sklearn.linear_model import LogisticRegression from sklearn.metrics import f1_score, accuracy_score, roc_auc_score from sklearn.calibration import CalibratedClassifierCV from tabulate import tabulate from utils.featurize import normalize, t_featurize, select_features from utils.symbolic import get_all_logprobs, train_trigram, get_exp_featurize from utils.symbolic import generate_symbolic_data from utils.load import get_generate_dataset, Dataset with open("results/best_features_four.txt") as f: best_features = f.read().strip().split("\n") print("Loading trigram model...") trigram_model = pickle.load( open("model/trigram_model.pkl", "rb"), pickle.HIGHEST_PROTOCOL ) tokenizer = tiktoken.encoding_for_model("davinci").encode wp_dataset = [ Dataset("normal", "data/wp/human"), Dataset("normal", "data/wp/gpt"), ] reuter_dataset = [ Dataset("author", "data/reuter/human"), Dataset("author", "data/reuter/gpt"), ] essay_dataset = [ Dataset("normal", "data/essay/human"), Dataset("normal", "data/essay/gpt"), ] eval_dataset = [ Dataset("normal", "data/wp/claude"), Dataset("author", "data/reuter/claude"), Dataset("normal", "data/essay/claude"), Dataset("normal", "data/wp/gpt_prompt1"), Dataset("author", "data/reuter/gpt_prompt1"), Dataset("normal", "data/essay/gpt_prompt1"), Dataset("normal", "data/wp/gpt_prompt2"), Dataset("author", "data/reuter/gpt_prompt2"), Dataset("normal", "data/essay/gpt_prompt2"), Dataset("normal", "data/wp/gpt_writing"), Dataset("author", "data/reuter/gpt_writing"), Dataset("normal", "data/essay/gpt_writing"), Dataset("normal", "data/wp/gpt_semantic"), Dataset("author", "data/reuter/gpt_semantic"), Dataset("normal", "data/essay/gpt_semantic"), ] def get_featurized_data(generate_dataset_fn, best_features): t_data = generate_dataset_fn(t_featurize) davinci, ada, trigram, unigram = get_all_logprobs( generate_dataset_fn, trigram=trigram_model, tokenizer=tokenizer ) vector_map = { "davinci-logprobs": lambda file: davinci[file], "ada-logprobs": lambda file: ada[file], "trigram-logprobs": lambda file: trigram[file], "unigram-logprobs": lambda file: unigram[file], } exp_featurize = get_exp_featurize(best_features, vector_map) exp_data = generate_dataset_fn(exp_featurize) return np.concatenate([t_data, exp_data], axis=1) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--generate_symbolic_data", action="store_true") parser.add_argument("--generate_symbolic_data_four", action="store_true") parser.add_argument("--generate_symbolic_data_eval", action="store_true") parser.add_argument("--perform_feature_selection", action="store_true") parser.add_argument("--perform_feature_selection_one", action="store_true") parser.add_argument("--perform_feature_selection_two", action="store_true") parser.add_argument("--perform_feature_selection_four", action="store_true") parser.add_argument("--perform_feature_selection_only_ada", action="store_true") parser.add_argument("--perform_feature_selection_no_gpt", action="store_true") parser.add_argument("--perform_feature_selection_domain", action="store_true") parser.add_argument("--only_include_gpt", action="store_true") parser.add_argument("--train_on_all_data", action="store_true") parser.add_argument("--seed", type=int, default=0) args = parser.parse_args() np.random.seed(args.seed) result_table = [["F1", "Accuracy", "AUC"]] datasets = [ *wp_dataset, *reuter_dataset, *essay_dataset, ] generate_dataset_fn = get_generate_dataset(*datasets) if args.generate_symbolic_data: generate_symbolic_data( generate_dataset_fn, max_depth=3, output_file="symbolic_data_gpt", verbose=True, ) t_data = generate_dataset_fn(t_featurize) pickle.dump(t_data, open("t_data", "wb")) if args.generate_symbolic_data_eval: generate_dataset_fn_eval = get_generate_dataset(*eval_dataset) generate_symbolic_data( generate_dataset_fn_eval, max_depth=3, output_file="symbolic_data_eval", verbose=True, ) t_data_eval = generate_dataset_fn_eval(t_featurize) pickle.dump(t_data_eval, open("t_data_eval", "wb")) if args.generate_symbolic_data_four: generate_symbolic_data( generate_dataset_fn, max_depth=4, output_file="symbolic_data_gpt_four", verbose=True, ) t_data = generate_dataset_fn(t_featurize) pickle.dump(t_data, open("t_data", "wb")) labels = generate_dataset_fn( lambda file: 1 if any([m in file for m in ["gpt", "claude"]]) else 0 ) indices = np.arange(len(labels)) if args.only_include_gpt: where_gpt = np.where( generate_dataset_fn(lambda file: 0 if "claude" in file else 1) )[0] indices = indices[where_gpt] np.random.shuffle(indices) train, test = ( indices[: math.floor(0.8 * len(indices))], indices[math.floor(0.8 * len(indices)) :], ) print("Train/Test Split", train, test) print("Train Size:", len(train), "Valid Size:", len(test)) print(f"Positive Labels: {sum(labels[indices])}, Total Labels: {len(indices)}") if args.perform_feature_selection: exp_to_data = pickle.load(open("symbolic_data_gpt", "rb")) best_features = select_features( exp_to_data, labels, verbose=True, to_normalize=True, indices=train ) with open("results/best_features_three.txt", "w") as f: for feat in best_features: f.write(feat + "\n") if args.perform_feature_selection_two: old_exp_to_data = pickle.load(open("symbolic_data_gpt", "rb")) exp_to_data = {} for key in old_exp_to_data: if len(key.split(" ")) <= 4: exp_to_data[key] = old_exp_to_data[key] best_features = select_features( exp_to_data, labels, verbose=True, to_normalize=True, indices=train ) with open("results/best_features_two.txt", "w") as f: for feat in best_features: f.write(feat + "\n") if args.perform_feature_selection_one: old_exp_to_data = pickle.load(open("symbolic_data_gpt", "rb")) exp_to_data = {} for key in old_exp_to_data: if len(key.split(" ")) <= 2: exp_to_data[key] = old_exp_to_data[key] best_features = select_features( exp_to_data, labels, verbose=True, to_normalize=True, indices=train ) with open("results/best_features_one.txt", "w") as f: for feat in best_features: f.write(feat + "\n") if args.perform_feature_selection_four: exp_to_data = pickle.load(open("symbolic_data_gpt_four", "rb")) best_features = select_features( exp_to_data, labels, verbose=True, to_normalize=True, indices=train ) with open("results/best_features_four.txt", "w") as f: for feat in best_features: f.write(feat + "\n") if args.perform_feature_selection_no_gpt: old_exp_to_data = pickle.load(open("symbolic_data_gpt", "rb")) exp_to_data = {} for key in old_exp_to_data: if "ada" not in key and "davinci" not in key: exp_to_data[key] = old_exp_to_data[key] best_features = select_features( exp_to_data, labels, verbose=True, to_normalize=True, indices=train ) with open("results/best_features_no_gpt.txt", "w") as f: for feat in best_features: f.write(feat + "\n") if args.perform_feature_selection_only_ada: old_exp_to_data = pickle.load(open("symbolic_data_gpt", "rb")) exp_to_data = {} for key in old_exp_to_data: if "davinci" not in key: exp_to_data[key] = old_exp_to_data[key] best_features = select_features( exp_to_data, labels, verbose=True, to_normalize=True, indices=train ) with open("results/best_features_only_ada.txt", "w") as f: for feat in best_features: f.write(feat + "\n") if args.perform_feature_selection_domain: exp_to_data = pickle.load(open("symbolic_data_gpt", "rb")) wp_indices = np.where(generate_dataset_fn(lambda file: "wp" in file))[0] reuter_indices = np.where(generate_dataset_fn(lambda file: "reuter" in file))[0] essay_indices = np.where(generate_dataset_fn(lambda file: "essay" in file))[0] wp_features = select_features( exp_to_data, labels, verbose=True, to_normalize=True, indices=wp_indices ) with open("results/best_features_wp.txt", "w") as f: for feat in wp_features: f.write(feat + "\n") reuter_features = select_features( exp_to_data, labels, verbose=True, to_normalize=True, indices=reuter_indices ) with open("results/best_features_reuter.txt", "w") as f: for feat in reuter_features: f.write(feat + "\n") essay_features = select_features( exp_to_data, labels, verbose=True, to_normalize=True, indices=essay_indices ) with open("results/best_features_essay.txt", "w") as f: for feat in essay_features: f.write(feat + "\n") data, mu, sigma = normalize( get_featurized_data(generate_dataset_fn, best_features), ret_mu_sigma=True ) print(f"Best Features: {best_features}") print(f"Data Shape: {data.shape}") base = LogisticRegression() model = CalibratedClassifierCV(base, cv=5) if args.train_on_all_data: model.fit(data, labels) with open("model/features.txt", "w") as f: for feat in best_features: f.write(feat + "\n") pickle.dump(model, open("model/model", "wb")) pickle.dump(mu, open("model/mu", "wb")) pickle.dump(sigma, open("model/sigma", "wb")) print("Saved model to model/") else: model.fit(data[train], labels[train]) predictions = model.predict(data[test]) probs = model.predict_proba(data[test])[:, 1] result_table.append( [ round(f1_score(labels[test], predictions), 3), round(accuracy_score(labels[test], predictions), 3), round(roc_auc_score(labels[test], probs), 3), ] ) print(tabulate(result_table, headers="firstrow", tablefmt="grid")) ================================================ FILE: utils/__init__.py ================================================ import openai from .write_logprobs import * from .featurize import * from .n_gram import * openai_path = "" if os.path.exists("../../openai.config"): openai_path = "../../openai.config" elif os.path.exists("../openai.config"): openai_path = "../openai.config" elif os.path.exists("openai.config"): openai_path = "openai.config" if openai_path: openai_config = json.loads(open(openai_path).read()) openai.api_key = openai_config["api_key"] openai.organization = openai_config["organization"] ================================================ FILE: utils/featurize.py ================================================ import numpy as np import os import tqdm from nltk import ngrams from utils.score import k_fold_score def get_logprobs(file): """ Returns a vector containing all the logprobs from a given logprobs file """ logprobs = [] with open(file) as f: for line in f.read().strip().split("\n"): line = line.split(" ") logprobs.append(np.exp(-float(line[1]))) return np.array(logprobs) def get_tokens(file): """ Returns a list of all tokens from a given logprobs file """ with open(file) as f: tokens = list(map(lambda x: x.split(" ")[0], f.read().strip().split("\n"))) return tokens def get_token_len(tokens): """ Returns a vector of word lengths, in tokens """ tokens_len = [] curr = 0 for token in tokens: if token[0] == "Ġ": tokens_len.append(curr) curr = 1 else: curr += 1 return np.array(tokens_len) def get_diff(file1, file2): """ Returns difference in logprobs bewteen file1 and file2 """ return get_logprobs(file1) - get_logprobs(file2) def convolve(X, window=100): """ Returns a vector of running average with window size """ ret = [] for i in range(len(X) - window): ret.append(np.mean(X[i : i + window])) return np.array(ret) def score_ngram(doc, model, tokenizer, n=3, strip_first=False): """ Returns vector of ngram probabilities given document, model and tokenizer """ scores = [] if strip_first: doc = " ".join(doc.split()[:1000]) for i in ngrams((n - 1) * [50256] + tokenizer(doc.strip()), n): scores.append(model.n_gram_probability(i)) return np.array(scores) def normalize(data, mu=None, sigma=None, ret_mu_sigma=False): """ Normalizes data, where data is a matrix where the first dimension is the number of examples """ if mu is None: mu = np.mean(data, axis=0) if sigma is None: raw_std = np.std(data, axis=0) sigma = np.ones_like(raw_std) sigma[raw_std != 0] = raw_std[raw_std != 0] if ret_mu_sigma: return (data - mu) / sigma, mu, sigma else: return (data - mu) / sigma def convert_file_to_logprob_file(file_name, model): """ Removes the extension of file_name, then goes to the logprobs folder of the current directory, and appends a -{model}.txt to it. Example: convert_file_to_logprob_file("data/test.txt", "davinci") = "data/logprobs/test-davinci.txt" """ directory = os.path.dirname(file_name) base_name = os.path.basename(file_name) file_name_without_ext = os.path.splitext(base_name)[0] logprob_directory = os.path.join(directory, "logprobs") logprob_file_name = f"{file_name_without_ext}-{model}.txt" logprob_file_path = os.path.join(logprob_directory, logprob_file_name) return logprob_file_path def t_featurize_logprobs(davinci_logprobs, ada_logprobs, tokens): X = [] outliers = [] for logprob in davinci_logprobs: if logprob > 3: outliers.append(logprob) X.append(len(outliers)) outliers += [0] * (50 - len(outliers)) X.append(np.mean(outliers[:25])) X.append(np.mean(outliers[25:50])) diffs = sorted(davinci_logprobs - ada_logprobs, reverse=True) diffs += [0] * (50 - min(50, len(diffs))) X.append(np.mean(diffs[:25])) X.append(np.mean(diffs[25:])) token_len = sorted(get_token_len(tokens), reverse=True) token_len += [0] * (50 - min(50, len(token_len))) X.append(np.mean(token_len[:25])) X.append(np.mean(token_len[25:])) return X def t_featurize(file, num_tokens=2048): """ Manually handcrafted features for classification. """ davinci_file = convert_file_to_logprob_file(file, "davinci") ada_file = convert_file_to_logprob_file(file, "ada") davinci_logprobs = get_logprobs(davinci_file)[:num_tokens] ada_logprobs = get_logprobs(ada_file)[:num_tokens] tokens = get_tokens(davinci_file)[:num_tokens] return t_featurize_logprobs(davinci_logprobs, ada_logprobs, tokens) def select_features(exp_to_data, labels, verbose=True, to_normalize=True, indices=None): if to_normalize: normalized_exp_to_data = {} for key in exp_to_data: normalized_exp_to_data[key] = normalize(exp_to_data[key]) else: normalized_exp_to_data = exp_to_data def get_data(*exp): return np.concatenate([normalized_exp_to_data[e] for e in exp], axis=1) val_exp = list(exp_to_data.keys()) curr = 0 best_features = [] i = 0 while val_exp: best_score, best_exp = -1, "" for exp in tqdm.tqdm(val_exp) if verbose else val_exp: score = k_fold_score( get_data(*best_features, exp), labels, k=5, indices=indices ) if score > best_score: best_score = score best_exp = exp if verbose: print( f"Iteration {i}, Current Score: {curr}, \ Best Feature: {best_exp}, New Score: {best_score}" ) if best_score <= curr: break else: best_features.append(best_exp) val_exp.remove(best_exp) curr = best_score i += 1 return best_features ================================================ FILE: utils/generate.py ================================================ import math import os import tqdm import openai from utils import write_logprobs from tenacity import ( retry, stop_after_attempt, wait_random_exponential, ) def round_up(x, base=50): return int(math.ceil(x / 50)) * 50 @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) def openai_backoff(**kwargs): return openai.ChatCompletion.create(**kwargs) def generate_documents(output_dir, prompts, verbose=True, force_regenerate=False): if not os.path.exists(f"{output_dir}/logprobs"): os.mkdir(f"{output_dir}/logprobs") if verbose: print("Generating Articles...") for idx, prompt in (enumerate(tqdm.tqdm(prompts)) if verbose else enumerate(prompts)): if os.path.exists(f"{output_dir}/{idx}.txt") and not force_regenerate: continue response = openai_backoff( model="gpt-3.5-turbo", messages=[ { "role": "user", "content": prompt, } ], ) reply = response["choices"][0]["message"]["content"].strip() with open(f"{output_dir}/{idx}.txt", "w") as f: f.write(f"{reply}") if verbose: print("Writing logprobs...") for idx, prompt in (enumerate(tqdm.tqdm(prompts)) if verbose else enumerate(prompts)): with open(f"{output_dir}/{idx}.txt") as f: doc = f.read().strip() if not os.path.exists(f"{output_dir}/logprobs/{idx}-davinci.txt") and not force_regenerate: write_logprobs( doc, f"{output_dir}/logprobs/{idx}-davinci.txt", "davinci" ) if not os.path.exists(f"{output_dir}/logprobs/{idx}-ada.txt") and not force_regenerate: write_logprobs( doc, f"{output_dir}/logprobs/{idx}-curie.txt", "ada" ) ================================================ FILE: utils/load.py ================================================ from dataclasses import dataclass import tqdm import numpy as np import os DIR_IGNORE = {"logprobs", "prompts", "headlines"} @dataclass class Dataset: type: str path: str def get_generate_dataset_normal(path: str, verbose=False): files = [] to_iter = tqdm.tqdm(os.listdir(path)) if verbose else os.listdir(path) for file in to_iter: if file in DIR_IGNORE: continue files.append(f"{path}/{file}") return files def get_generate_dataset_author(path: str, author: str, verbose=False): files = [] if author is None: authors = sorted(os.listdir(path)) else: authors = [author] to_iter = tqdm.tqdm(authors) if verbose else authors for author in to_iter: for file in sorted(os.listdir(f"{path}/{author}")): if file in DIR_IGNORE: continue files.append(f"{path}/{author}/{file}") return files def get_generate_dataset(*datasets: Dataset): def generate_dataset(featurize, split=None, verbose=False, author=None): files = [] for dataset in datasets: if dataset.type == "normal": files += get_generate_dataset_normal(dataset.path) elif dataset.type == "author": files += get_generate_dataset_author(dataset.path, author=author) if split is not None: files = np.array(files)[split] data = [] files = tqdm.tqdm(files) if verbose else files for file in files: if "logprobs" in file: continue data.append(featurize(file)) return np.array(data) return generate_dataset ================================================ FILE: utils/n_gram.py ================================================ import tqdm from collections import defaultdict, Counter class NGramModel: """ An n-gram model, where alpha is the laplace smoothing parameter. """ def __init__(self, train_text, n=2, alpha=3e-3, vocab_size=None): self.n = n if vocab_size is None: # Assume GPT tokenizer self.vocab_size = 50257 self.smoothing = alpha self.smoothing_f = alpha * self.vocab_size self.c = defaultdict(lambda: [0, Counter()]) for i in tqdm.tqdm(range(len(train_text)-n)): n_gram = tuple(train_text[i:i+n]) self.c[n_gram[:-1]][1][n_gram[-1]] += 1 self.c[n_gram[:-1]][0] += 1 self.n_size = len(self.c) def n_gram_probability(self, n_gram): assert len(n_gram) == self.n it = self.c[tuple(n_gram[:-1])] prob = (it[1][n_gram[-1]] + self.smoothing)/(it[0] + self.smoothing_f) return prob class DiscountBackoffModel(NGramModel): """ An n-gram model with discounting and backoff. Delta is the discounting parameter. """ def __init__(self, train_text, lower_order_model, n=2, delta=0.9): super().__init__(train_text, n=n) self.lower_order_model = lower_order_model self.discount = delta def n_gram_probability(self, n_gram): assert len(n_gram) == self.n it = self.c[tuple(n_gram[:-1])] if it[0] == 0: return self.lower_order_model.n_gram_probability(n_gram[1:]) prob = self.discount * \ (len(it[1])/it[0]) * \ self.lower_order_model.n_gram_probability(n_gram[1:]) if it[1][n_gram[-1]] != 0: prob += max(it[1][n_gram[-1]] - self.discount, 0) / it[0] return prob class KneserNeyBaseModel(NGramModel): """ A Kneser-Ney base model, where n=1. """ def __init__(self, train_text, vocab_size=None): super().__init__(train_text, n=1, vocab_size=vocab_size) base_cnt = defaultdict(set) for i in range(1, len(train_text)): base_cnt[train_text[i]].add(train_text[i-1]) cnt = 0 for word in base_cnt: cnt += len(base_cnt[word]) self.prob = defaultdict(float) for word in base_cnt: self.prob[word] = len(base_cnt[word]) / cnt def n_gram_probability(self, n_gram): assert len(n_gram) == 1 ret_prob = self.prob[n_gram[0]] if ret_prob == 0: return 1 / self.vocab_size else: return ret_prob class TrigramBackoff: """ A trigram model with discounting and backoff. Uses a Kneser-Ney base model. """ def __init__(self, train_text, delta=0.9): self.base = KneserNeyBaseModel(train_text) self.bigram = DiscountBackoffModel( train_text, self.base, n=2, delta=delta) self.trigram = DiscountBackoffModel( train_text, self.bigram, n=3, delta=delta) def n_gram_probability(self, n_gram): assert len(n_gram) == 3 return self.trigram.n_gram_probability(n_gram) ================================================ FILE: utils/score.py ================================================ import numpy as np from sklearn.linear_model import LogisticRegression from torch.utils.data import random_split def k_fold_score(X, labels, indices=None, k=8, precision=10): if indices is None: indices = np.arange(X.shape[0]) splits = [len(indices) // k] * k splits[-1] += len(indices) % k k_split = random_split(indices, splits) score_sum = 0 for i in range(k): train = np.concatenate([np.array(k_split[j]) for j in range(k) if i != j]) model = LogisticRegression(C=10, penalty="l2", max_iter=1000) model.fit(X[train], labels[train]) score_sum += model.score(X[k_split[i]], labels[k_split[i]]) return round(score_sum / k, precision) ================================================ FILE: utils/symbolic.py ================================================ from nltk.util import ngrams from nltk.corpus import brown from nltk.tokenize import word_tokenize import tqdm import numpy as np import tiktoken import dill as pickle from utils.featurize import * from utils.n_gram import * from collections import Counter, defaultdict from sklearn.linear_model import LogisticRegression vec_functions = { "v-add": lambda a, b: a + b, "v-sub": lambda a, b: a - b, "v-mul": lambda a, b: a * b, "v-div": lambda a, b: np.divide( a, b, out=np.zeros_like(a), where=(b != 0), casting="unsafe" ), "v->": lambda a, b: a > b, "v-<": lambda a, b: a < b, } scalar_functions = { "s-max": max, "s-min": min, "s-avg": lambda x: sum(x) / len(x), "s-avg-top-25": lambda x: sum(sorted(x, reverse=True)[:25]) / len(sorted(x, reverse=True)[:25]), "s-len": len, "s-var": np.var, "s-l2": np.linalg.norm, } vectors = ["davinci-logprobs", "ada-logprobs", "trigram-logprobs", "unigram-logprobs"] # Get vec_combinations vec_combinations = defaultdict(list) for vec1 in range(len(vectors)): for vec2 in range(vec1): for func in vec_functions: if func != "v-div": vec_combinations[vectors[vec1]].append(f"{func} {vectors[vec2]}") for vec1 in vectors: for vec2 in vectors: if vec1 != vec2: vec_combinations[vec1].append(f"v-div {vec2}") def get_words(exp): """ Splits up expression into words, to be individually processed """ return exp.split(" ") def backtrack_functions( vectors=( "davinci-logprobs", "ada-logprobs", "trigram-logprobs", "unigram-logprobs", ), max_depth=2, ): """ Backtrack all possible features. """ def helper(prev, depth): if depth >= max_depth: return [] all_funcs = [] prev_word = get_words(prev)[-1] for func in scalar_functions: all_funcs.append(f"{prev} {func}") for comb in vec_combinations[prev_word]: all_funcs += helper(f"{prev} {comb}", depth + 1) return all_funcs ret = [] for vec in vectors: ret += helper(vec, 0) return ret def train_trigram(verbose=True, return_tokenizer=False): """ Trains and returns a trigram model on the brown corpus """ enc = tiktoken.encoding_for_model("davinci") tokenizer = enc.encode vocab_size = enc.n_vocab # We use the brown corpus to train the n-gram model sentences = brown.sents() if verbose: print("Tokenizing corpus...") tokenized_corpus = [] for sentence in tqdm.tqdm(sentences): tokens = tokenizer(" ".join(sentence)) tokenized_corpus += tokens if verbose: print("\nTraining n-gram model...") if return_tokenizer: return TrigramBackoff(tokenized_corpus), tokenizer else: return TrigramBackoff(tokenized_corpus) def get_all_logprobs( generate_dataset, preprocess=lambda x: x.strip(), verbose=True, trigram=None, tokenizer=None, num_tokens=2047, ): if trigram is None: trigram, tokenizer = train_trigram(verbose=verbose, return_tokenizer=True) davinci_logprobs, ada_logprobs = {}, {} trigram_logprobs, unigram_logprobs = {}, {} if verbose: print("Loading logprobs into memory") file_names = generate_dataset(lambda file: file, verbose=False) to_iter = tqdm.tqdm(file_names) if verbose else file_names for file in to_iter: if "logprobs" in file: continue with open(file, "r") as f: doc = preprocess(f.read()) davinci_logprobs[file] = get_logprobs( convert_file_to_logprob_file(file, "davinci") )[:num_tokens] ada_logprobs[file] = get_logprobs(convert_file_to_logprob_file(file, "ada"))[ :num_tokens ] trigram_logprobs[file] = score_ngram(doc, trigram, tokenizer, n=3)[:num_tokens] unigram_logprobs[file] = score_ngram(doc, trigram.base, tokenizer, n=1)[ :num_tokens ] return davinci_logprobs, ada_logprobs, trigram_logprobs, unigram_logprobs def generate_symbolic_data( generate_dataset, preprocess=lambda x: x, max_depth=2, output_file="symbolic_data", verbose=True, vector_map=None, ): """ Brute forces and generates symbolic data from a dataset of text files. """ if vector_map is None: ( davinci_logprobs, ada_logprobs, trigram_logprobs, unigram_logprobs, ) = get_all_logprobs(generate_dataset, preprocess=preprocess, verbose=verbose) vector_map = { "davinci-logprobs": lambda file: davinci_logprobs[file], "ada-logprobs": lambda file: ada_logprobs[file], "trigram-logprobs": lambda file: trigram_logprobs[file], "unigram-logprobs": lambda file: unigram_logprobs[file], } all_funcs = backtrack_functions(max_depth=max_depth) if verbose: print(f"\nTotal # of Features: {len(all_funcs)}.") print("Sampling 5 features:") for i in range(5): print(all_funcs[np.random.randint(0, len(all_funcs))]) print("\nGenerating datasets...") def calc_features(file, exp): exp_tokens = get_words(exp) curr = vector_map[exp_tokens[0]](file) for i in range(1, len(exp_tokens)): if exp_tokens[i] in vec_functions: next_vec = vector_map[exp_tokens[i + 1]](file) curr = vec_functions[exp_tokens[i]](curr, next_vec) elif exp_tokens[i] in scalar_functions: return scalar_functions[exp_tokens[i]](curr) exp_to_data = {} for exp in tqdm.tqdm(all_funcs): exp_to_data[exp] = generate_dataset( lambda file: calc_features(file, exp) ).reshape(-1, 1) pickle.dump(exp_to_data, open(output_file, "wb")) def get_exp_featurize(best_features, vector_map): def calc_features(file, exp): exp_tokens = get_words(exp) curr = vector_map[exp_tokens[0]](file) for i in range(1, len(exp_tokens)): if exp_tokens[i] in vec_functions: next_vec = vector_map[exp_tokens[i + 1]](file) curr = vec_functions[exp_tokens[i]](curr, next_vec) elif exp_tokens[i] in scalar_functions: return scalar_functions[exp_tokens[i]](curr) def exp_featurize(file): return np.array([calc_features(file, exp) for exp in best_features]) return exp_featurize ================================================ FILE: utils/write_logprobs.py ================================================ import openai import json import numpy as np import tiktoken import torch import torch.nn.functional as F from transformers import AutoTokenizer tokenizer = tiktoken.encoding_for_model("davinci") llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") vocab_map = {} vocab = llama_tokenizer.vocab for token in vocab: idx = vocab[token] vocab_map[idx] = token def write_logprobs(text, file, model): """ Run text under model and write logprobs to file, separated by newline. """ tokens = tokenizer.encode(text) doc = tokenizer.decode(tokens[:2047]) response = openai.Completion.create( model=model, prompt="<|endoftext|>" + doc, max_tokens=0, echo=True, logprobs=1, ) subwords = response["choices"][0]["logprobs"]["tokens"][1:] subprobs = response["choices"][0]["logprobs"]["token_logprobs"][1:] gpt2_map = {"\n": "Ċ", "\t": "ĉ", " ": "Ġ"} for i in range(len(subwords)): for k, v in gpt2_map.items(): subwords[i] = subwords[i].replace(k, v) to_write = "" for _, (w, p) in enumerate(zip(subwords, subprobs)): to_write += f"{w} {-p}\n" with open(file, "w") as f: f.write(to_write) def write_llama_logprobs(text, file, model): with torch.no_grad(): encodings = llama_tokenizer(text, return_tensors="pt").to(device) logits = F.softmax(model(encodings["input_ids"]).logits, dim=2) tokens = encodings["input_ids"] indices = torch.tensor([[[i] for i in tokens[0]]])[:, 1:, :].to(device) subwords = [vocab_map[int(idx)] for idx in encodings["input_ids"][0][1:]] subprobs = ( torch.gather(logits[:, :-1, :], dim=2, index=indices) .flatten() .cpu() .detach() .numpy() ) to_write = "" for _, (w, p) in enumerate(zip(subwords, subprobs)): to_write += f"{w} {-np.log(p)}\n" with open(file, "w") as f: f.write(to_write)