Repository: FoundationAgents/ReCode Branch: main Commit: 6e7223f71281 Files: 74 Total size: 16.3 MB Directory structure: gitextract_bghxqcon/ ├── .gitignore ├── LICENSE ├── README.md ├── agents/ │ └── recode/ │ ├── agent.py │ ├── resources/ │ │ ├── fewshots/ │ │ │ ├── alfworld/ │ │ │ │ ├── clean.txt │ │ │ │ ├── cool.txt │ │ │ │ ├── examine.txt │ │ │ │ ├── heat.txt │ │ │ │ ├── put.txt │ │ │ │ └── puttwo.txt │ │ │ ├── sciworld/ │ │ │ │ └── base.txt │ │ │ └── webshop/ │ │ │ └── base.txt │ │ └── prompts/ │ │ ├── alfworld/ │ │ │ └── actions.txt │ │ ├── default_new.py │ │ ├── sciworld/ │ │ │ └── actions.txt │ │ └── webshop/ │ │ └── actions.txt │ └── utils.py ├── base/ │ ├── agent.py │ └── environment.py ├── configs/ │ ├── prices.json │ └── profiles_example.yaml ├── envs/ │ ├── alfworld/ │ │ ├── base_config.yaml │ │ └── env.py │ ├── sciworld/ │ │ ├── base_config.yaml │ │ ├── data/ │ │ │ ├── max_steps.json │ │ │ ├── taskname2id.json │ │ │ ├── test_indices.json │ │ │ ├── train_indices.json │ │ │ └── valid_indices.json │ │ └── env.py │ └── webshop/ │ ├── env.py │ ├── setup.py │ ├── setup.sh │ └── src/ │ └── webshop/ │ ├── __init__.py │ ├── run_envs/ │ │ ├── run_web_agent_site_env.py │ │ └── run_web_agent_text_env.py │ ├── search_engine/ │ │ └── lucene_searcher.py │ ├── transfer/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── app.py │ │ ├── predict_help.py │ │ └── webshop_lite.py │ └── web_agent_site/ │ ├── __init__.py │ ├── app.py │ ├── attributes/ │ │ ├── annotate.py │ │ └── generate_attrs.py │ ├── engine/ │ │ ├── __init__.py │ │ ├── engine.py │ │ ├── goal.py │ │ └── normalize.py │ ├── envs/ │ │ ├── __init__.py │ │ ├── chromedriver │ │ ├── web_agent_site_env.py │ │ └── web_agent_text_env.py │ ├── models/ │ │ ├── __init__.py │ │ └── models.py │ ├── static/ │ │ └── style.css │ ├── templates/ │ │ ├── attributes_page.html │ │ ├── description_page.html │ │ ├── done_page.html │ │ ├── features_page.html │ │ ├── item_page.html │ │ ├── results_page.html │ │ ├── review_page.html │ │ 
└── search_page.html │ └── utils.py ├── requirements.txt ├── run.py └── utils/ ├── common.py ├── errors.py ├── executor.py ├── llm.py ├── logger.py └── mockllm.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .vscode/ __pycache__/ .DS_Store *.pyc *.zip logs/ envs/webshop/data/ envs/webshop/search_index/ envs/webshop/data.zip envs/webshop/indexes.zip profiles.yaml ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2026 Foundation Agents Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # ReCode: Unify Plan and Action for Universal Granularity Control [](https://arxiv.org/abs/2510.23564) > If you encounter any difficulties in using or reproducing the code, please contact me at [zhaoyangyu713@gmail.com](mailto:zhaoyangyu713@gmail.com). ReCode introduces recursive code generation for LLM agents, unifying plan and action into a single representation. By treating high-level plans as placeholder functions that recursively decompose into executable primitives, it achieves universal granularity control and dynamically adapts from strategic thinking to concrete actions. This repository hosts the reference implementation used in the paper, along with environment wrappers and experiment tooling.
## Core Idea ReCode adopts a divide-and-conquer strategy, decomposing complex tasks into executable code fragments: 1. **Tree-structured code**: Organizes partial programs in a tree where each node captures one sub-task and records its execution trace. 2. **Recursive expansion**: Placeholder functions are expanded by the LLM into more specific calls or smaller subroutines using environment-specific prompts and few-shots. 3. **Dynamic execution loop**: Each node is executed immediately; fresh observations decide whether to expand further, retry, or finish. 4. **Shared executor state**: A constrained Python executor maintains environment variables, validates code blocks, and exposes the toolset available to the agent. ## Repository Layout - `run.py` – CLI entry point that instantiates agents/envs, manages concurrency, and writes run summaries. - `agents/recode/` – ReCode agent implementation, prompt templates, and utility helpers. - `envs/` – Environment wrappers and assets for `alfworld`, `webshop`, and `sciworld`. - `configs/` – LLM profile templates and (expected) pricing metadata used by the async client. - `utils/` – Shared components: async OpenAI wrapper, constrained executor, logging helpers, error types. - `figures/` – Paper figures used throughout this README. ## Experiments To evaluate the effectiveness of ReCode, we divide our experiments into the inference part and the training part. 1. **Inference Result**: we compare against several mainstream paradigm (ReAct, CodeAct) and some of the work focused on improving LLM-based agent planning (AdaPlanner and ADaPT). ReCode achieved significant performance improvements across all three environments, with an average score of 60.8, surpassing the best baseline method by 10.5 (relative 20.9%). _With our tests, ReCode can achieve a perfect **100** score in ALFWorld under `claude-4-sonnet`._ 2. **Training Result**: we conduct supervised fine-tuning (SFT) on ReCode, ReAct and CodeAct with `Qwen2.5-7B-Instruct`. 
ReCode+SFT delivers an impressive average performance of 70.4% across all environments, outperforming both ReAct+SFT (67.6%) and CodeAct+SFT (55.8%), highlighting its exceptional data efficiency. ## Quick Start To run ReCode, we need a conda environment. The python version should be 3.10 or newer. Then, it is necessary to configure dependencies for three environments (it has not been confirmed whether conflicts will arise in the same environment), and we suggest configuring them in three separate environments. ```bash conda create -n recode-envname python=3.10 # Replace "envname" with the your environment name. conda activate recode-envname ``` --- ### ALFWorld - Follow the [ALFWorld instructions](https://github.com/alfworld/alfworld). - Set `ALFWORLD_DATA` to the dataset root or edit `envs/alfworld/base_config.yaml` to point to your local paths: ```bash export ALFWORLD_DATA=/path/to/alfworld ``` ### ScienceWorld - Follow the instruciton from the [ScienceWorld repository](https://github.com/allenai/ScienceWorld). ### WebShop Thanks to [ETO ](https://github.com/Yifan-Song793/ETO) for providing a convenient script to configure WebShop environment. ```bash cd envs/webshop pip install -e . conda install -y -c conda-forge openjdk=11 pip install "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl" ``` Run the provided helper to fetch the goal set and pre-built search index: ```bash # The current path is "envs/webshop" bash setup.sh ``` --- Install some other dependencies. 
```bash pip install -r requirements.txt # Here may not be complete, please contact me promptly if you encounter any problems ``` Ensure `configs/profiles.yaml` points to a valid API credential (copy `configs/profiles_example.yaml` if you need a template), then run a short dry run in any enabled environment: ```bash python run.py -a recode -e alfworld -n 1 --split test --profile default ``` Replace `alfworld` with `webshop` or `sciworld` once their assets are available. Logs are written to `logs/
The files in this directory serve the following purposes:
* `app.py`: Run to launch an interactive [Gradio](https://gradio.app/) demo of the app
* `predict_help.py`: Amazon, eBay web scraping code
* `webshop_lite.py`: A condensed version of WebShop's templating engine
If you are interested in *transferring an agent's functionality to a new website or platform*, you will need to...
1. implement two new functions: `parse_results_To learn more about this project, check out the project page!
", description="Sim-to-real transfer of agent trained on WebShop to search a desired product on Amazon from any natural language query!
", ).launch(inline=False) ================================================ FILE: envs/webshop/src/webshop/transfer/predict_help.py ================================================ from bs4 import BeautifulSoup from bs4.element import Comment from enum import Enum import re, time from urllib.parse import urlencode import json, requests, torch class Page(Enum): DESC = "description" FEATURES = "features" ITEM_PAGE = "item_page" RESULTS = "results" REVIEWS = "reviews" SEARCH = "search" SUB_PAGE = "item_sub_page" HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36' DEBUG_HTML = "temp.html" NUM_PROD_LIMIT = 10 WEBSHOP_URL = "http://3.83.245.205:3000" WEBSHOP_SESSION = "abc" def parse_results_ebay(query, page_num=None, verbose=True): query_string = '+'.join(query.split()) page_num = 1 if page_num is None else page_num url = f'https://www.ebay.com/sch/i.html?_nkw={query_string}&_pgn={page_num}' if verbose: print(f"Search Results URL: {url}") webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}) soup = BeautifulSoup(webpage.text, 'html.parser') products = soup.select('.s-item__wrapper.clearfix') results = [] for item in products[:NUM_PROD_LIMIT]: title = item.select_one('.s-item__title').text.strip() if "shop on ebay" in title.lower(): # Skip "Shop on ebay" product title continue link = item.select_one('.s-item__link')['href'] asin = link.split("?")[0][len("https://www.ebay.com/itm/"):] try: price = item.select_one('.s-item__price').text if "to" in price: prices = price.split(" to ") price = [p.strip("$") for p in prices] except: price = None results.append({ "asin": asin, "Title": title, "Price": price }) if verbose: print(f"Scraped {len(results)} products") return results def parse_item_page_ebay(asin, verbose=True): product_dict = {} product_dict["asin"] = asin url = f"https://www.ebay.com/itm/{asin}" if verbose: print(f"Item Page URL: {url}") 
begin = time.time() webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}) end = time.time() if verbose: print(f"Item page scraping took {end-begin} seconds") soup = BeautifulSoup(webpage.content, "html.parser") # Title try: product_dict["Title"] = soup.find('h1', {'class': 'x-item-title__mainTitle'}).text.strip() except: product_dict["Title"] = "N/A" # Price: Get price string, extract decimal numbers from string try: price_str = soup.find('div', {'class': 'mainPrice'}).text prices = re.findall('\d*\.?\d+', price_str) product_dict["Price"] = prices[0] except: product_dict["Price"] = "N/A" # Main Image try: img_div = soup.find('div', {'id': 'mainImgHldr'}) img_link = img_div.find('img', {'id': 'icImg'})["src"] product_dict["MainImage"] = img_link except: product_dict["MainImage"] = "" # Rating try: rating = soup.find('span', {'class': 'reviews-star-rating'})["title"].split()[0] except: rating = None product_dict["Rating"] = rating # Options options, options_to_images = {}, {} # TODO: options_to_images possible? 
try: option_blocks = soup.findAll('select', {'class': 'msku-sel'}) for block in option_blocks: name = block["name"].strip().strip(":") option_tags = block.findAll("option") opt_list = [] for option_tag in option_tags: if "select" not in option_tag.text.lower(): # Do not include "- select -" (aka `not selected`) choice opt_list.append(option_tag.text) options[name] = opt_list except: options = {} product_dict["options"], product_dict["option_to_image"] = options, options_to_images # Description desc = None try: # Ebay descriptions are shown in `iframe`s desc_link = soup.find('iframe', {'id': 'desc_ifr'})["src"] desc_webpage = requests.get(desc_link, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}) desc_soup = BeautifulSoup(desc_webpage.content, "html.parser") desc = ' '.join(desc_soup.text.split()) except: desc = "N/A" product_dict["Description"] = desc # Features features = None try: features = soup.find('div', {'class': 'x-about-this-item'}).text except: features = "N/A" product_dict["BulletPoints"] = features return product_dict def parse_results_ws(query, page_num=None, verbose=True): query_string = '+'.join(query.split()) page_num = 1 if page_num is None else page_num url = ( f'{WEBSHOP_URL}/search_results/{WEBSHOP_SESSION}/' f'{query_string}/{page_num}' ) if verbose: print(f"Search Results URL: {url}") webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}) soup = BeautifulSoup(webpage.content, 'html.parser') products = soup.findAll('div', {'class': 'list-group-item'}) results = [] for product in products: asin = product.find('a', {'class': 'product-link'}) title = product.find('h4', {'class': 'product-title'}) price = product.find('h5', {'class': 'product-price'}) if "\n" in title: title = title.text.split("\n")[0].strip() else: title = title.text.strip().strip("\n") if "to" in price.text: # Parse if price presented as range prices = price.text.split(" to ") price = 
[float(p.strip().strip("\n$")) for p in prices] else: price = float(price.text.strip().strip("\n$")) results.append({ "asin": asin.text, "Title": title, "Price": price }) if verbose: print(f"Scraped {len(results)} products") return results def parse_item_page_ws(asin, query, page_num, options, verbose=True): product_dict = {} product_dict["asin"] = asin query_string = '+'.join(query.split()) options_string = json.dumps(options) url = ( f'{WEBSHOP_URL}/item_page/{WEBSHOP_SESSION}/' f'{asin}/{query_string}/{page_num}/{options_string}' ) if verbose: print(f"Item Page URL: {url}") webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}) soup = BeautifulSoup(webpage.content, 'html.parser') # Title, Price, Rating, and MainImage product_dict["Title"] = soup.find('h2').text h4_headers = soup.findAll("h4") for header in h4_headers: text = header.text if "Price" in text: product_dict["Price"] = text.split(":")[1].strip().strip("$") elif "Rating" in text: product_dict["Rating"] = text.split(":")[1].strip() product_dict["MainImage"] = soup.find('img')['src'] # Options options, options_to_image = {}, {} option_blocks = soup.findAll("div", {'class': 'radio-toolbar'}) for block in option_blocks: name = block.find("input")["name"] labels = block.findAll("label") inputs = block.findAll("input") opt_list = [] for label, input in zip(labels, inputs): opt = label.text opt_img_path = input["onclick"].split("href=")[1].strip('\';') opt_img_url = f'{WEBSHOP_URL}{opt_img_path}' opt_list.append(opt) options_to_image[opt] = opt_img_url options[name] = opt_list product_dict["options"] = options product_dict["option_to_image"] = options_to_image # Description url = ( f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/' f'{asin}/{query_string}/{page_num}/Description/{options_string}' ) if verbose: print(f"Item Description URL: {url}") webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}) soup = 
BeautifulSoup(webpage.content, 'html.parser') product_dict["Description"] = soup.find(name="p", attrs={'class': 'product-info'}).text.strip() # Features url = ( f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/' f'{asin}/{query_string}/{page_num}/Features/{options_string}' ) if verbose: print(f"Item Features URL: {url}") webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}) soup = BeautifulSoup(webpage.content, 'html.parser') bullets = soup.find(name="ul").findAll(name="li") product_dict["BulletPoints"] = '\n'.join([b.text.strip() for b in bullets]) return product_dict # Query -> Search Result ASINs def parse_results_amz(query, page_num=None, verbose=True): url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+") if page_num is not None: url += "&page=" + str(page_num) if verbose: print(f"Search Results URL: {url}") webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}) soup = BeautifulSoup(webpage.content, 'html.parser') products = soup.findAll('div', {'data-component-type': 's-search-result'}) if products is None: temp = open(DEBUG_HTML, "w") temp.write(str(soup)) temp.close() raise Exception("Couldn't find search results page, outputted html for inspection") results = [] for product in products[:NUM_PROD_LIMIT]: asin = product['data-asin'] title = product.find("h2", {'class': "a-size-mini"}) price_div = product.find("div", {'class': 's-price-instructions-style'}) price = price_div.find("span", {'class': 'a-offscreen'}) result = { 'asin': asin, 'Title': title.text.strip(), 'Price': price.text.strip().strip("$") } results.append(result) if verbose: print("Scraped", len(results), "products") return results # Scrape information of each product def parse_item_page_amz(asin, verbose=True): product_dict = {} product_dict["asin"] = asin url = f"https://www.amazon.com/dp/{asin}" if verbose: print("Item Page URL:", url) begin = time.time() webpage = requests.get(url, 
headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}) end = time.time() if verbose: print(f"Item page scraping took {end-begin} seconds") soup = BeautifulSoup(webpage.content, "html.parser") # Title try: title = soup.find("span", attrs={"id": 'productTitle'}) title = title.string.strip().replace(',', '') except AttributeError: title = "N/A" product_dict["Title"] = title # Price try: parent_price_span = soup.find(name="span", class_="apexPriceToPay") price_span = parent_price_span.find(name="span", class_="a-offscreen") price = float(price_span.getText().replace("$", "")) except AttributeError: price = "N/A" product_dict["Price"] = price # Rating try: rating = soup.find(name="span", attrs={"id": "acrPopover"}) if rating is None: rating = "N/A" else: rating = rating.text except AttributeError: rating = "N/A" product_dict["Rating"] = rating.strip("\n").strip() # Features try: features = soup.find(name="div", attrs={"id": "feature-bullets"}).text except AttributeError: features = "N/A" product_dict["BulletPoints"] = features # Description try: desc_body = soup.find(name="div", attrs={"id": "productDescription_feature_div"}) desc_div = desc_body.find(name="div", attrs={"id": "productDescription"}) desc_ps = desc_div.findAll(name="p") desc = " ".join([p.text for p in desc_ps]) except AttributeError: desc = "N/A" product_dict["Description"] = desc.strip() # Main Image try: imgtag = soup.find("img", {"id":"landingImage"}) imageurl = dict(imgtag.attrs)["src"] except AttributeError: imageurl = "" product_dict["MainImage"] = imageurl # Options options, options_to_image = {}, {} try: option_body = soup.find(name='div', attrs={"id": "softlinesTwister_feature_div"}) if option_body is None: option_body = soup.find(name='div', attrs={"id": "twister_feature_div"}) option_blocks = option_body.findAll(name='ul') for block in option_blocks: name = json.loads(block["data-a-button-group"])["name"] # Options opt_list = [] for li in block.findAll("li"): img = 
li.find(name="img") if img is not None: opt = img["alt"].strip() opt_img = img["src"] if len(opt) > 0: options_to_image[opt] = opt_img else: opt = li.text.strip() if len(opt) > 0: opt_list.append(opt) options[name.replace("_name", "").replace("twister_", "")] = opt_list except AttributeError: options = {} product_dict["options"], product_dict["option_to_image"] = options, options_to_image return product_dict # Get text observation from html # TODO[john-b-yang]: Similar to web_agent_site/envs/...text_env.py func def, merge? def convert_html_to_text(html, simple=False, clicked_options=None, visited_asins=None): def tag_visible(element): ignore = {'style', 'script', 'head', 'title', 'meta', '[document]'} return ( element.parent.name not in ignore and not isinstance(element, Comment) ) html_obj = BeautifulSoup(html, 'html.parser') texts = html_obj.find_all(string=True) visible_texts = filter(tag_visible, texts) if simple: return ' [SEP] '.join(t.strip() for t in visible_texts if t != '\n') else: observation = '' for t in visible_texts: if t == '\n': continue if t.parent.name == 'button': # button processed_t = f'[button] {t} [button]' elif t.parent.name == 'label': # options if f'{t}' in clicked_options: processed_t = f' [clicked button] {t} [clicked button]' observation = f'You have clicked {t}.\n' + observation else: processed_t = f' [button] {t} [button]' elif t.parent.get('class') == ["product-link"]: # asins if f'{t}' in visited_asins: processed_t = f'\n[clicked button] {t} [clicked button]' else: processed_t = f'\n[button] {t} [button]' else: # regular, unclickable text processed_t = str(t) observation += processed_t + '\n' return observation # Get action from dict of values retrieved from html def convert_dict_to_actions(page_type, products=None, asin=None, page_num=None) -> dict: info = {"valid": []} if page_type == Page.RESULTS: info["valid"] = ['click[back to search]'] if products is None or page_num is None: print(page_num) print(products) raise 
Exception('Provide `products`, `page_num` to get `results` valid actions') # Decide whether to add `next >` as clickable based on # of search results if len(products) > 10: info["valid"].append('click[next >]') # Add `< prev` as clickable if not first page of search results if page_num > 1: info["valid"].append('click[< prev]') for product in products: info["valid"].append("click[item - " + product["Title"] + "]") if page_type == Page.ITEM_PAGE: if products is None or asin is None: raise Exception('Provide `products` and `asin` to get `item_page` valid actions') info["valid"] = ['click[back to search]', 'click[< prev]', 'click[description]',\ 'click[features]', 'click[buy now]'] # To do: reviews if "options" in products[asin]: for key, values in products[asin]["options"].items(): for value in values: info["valid"].append("click[" + value + "]") if page_type == Page.SUB_PAGE: info["valid"] = ['click[back to search]', 'click[< prev]'] info['image_feat'] = torch.zeros(512) return info ================================================ FILE: envs/webshop/src/webshop/transfer/webshop_lite.py ================================================ import os from flask import render_template_string, Flask from .predict_help import Page app=Flask(__name__) app.debug=True SESSION_ID = "ABC" TEMPLATE_DIR = "envs/webshop/src/webshop/web_agent_site/templates/" KEYWORDS = ["placeholder (not needed)"] # To Do: Does this matter? 
QUERY = "" product_map = {} def read_html_template(path): with open(path) as f: template = f.read() return template @app.route('/', methods=['GET', 'POST']) def index(session_id, **kwargs): print("Hello world") @app.route('/', methods=['GET', 'POST']) def search_results(data): path = os.path.join(TEMPLATE_DIR, 'results_page.html') html = render_template_string( read_html_template(path=path), session_id=SESSION_ID, products=data, keywords=KEYWORDS, page=1, total=len(data), instruction_text=QUERY, ) return html @app.route('/', methods=['GET', 'POST']) def item_page(session_id, asin, keywords, page, options): path = os.path.join(TEMPLATE_DIR, 'item_page.html') html = render_template_string( read_html_template(path=path), session_id=session_id, product_info=product_map[asin], keywords=keywords, page=page, asin=asin, options=options, instruction_text=QUERY ) return html @app.route('/', methods=['GET', 'POST']) def item_sub_page(session_id, asin, keywords, page, sub_page, options): path = os.path.join(TEMPLATE_DIR, sub_page.value.lower() + "_page.html") html = render_template_string( read_html_template(path), session_id=session_id, product_info=product_map[asin], keywords=keywords, page=page, asin=asin, options=options, instruction_text=QUERY ) return html @app.route('/', methods=['GET', 'POST']) def done(asin, options, session_id, **kwargs): path = os.path.join(TEMPLATE_DIR, 'done_page.html') html = render_template_string( read_html_template(path), session_id=session_id, reward=1, asin=asin, options=product_map[asin]["options"], reward_info=kwargs.get('reward_info'), goal_attrs=kwargs.get('goal_attrs'), purchased_attrs=kwargs.get('purchased_attrs'), goal=kwargs.get('goal'), mturk_code=kwargs.get('mturk_code'), query=kwargs.get('query'), category=kwargs.get('category'), product_category=kwargs.get('product_category'), ) return html # Project Dictionary Information onto Fake Amazon def dict_to_fake_html(data, page_type, asin=None, sub_page_type=None, options=None, 
prod_map={}, query=""): global QUERY, product_map QUERY = query product_map = prod_map with app.app_context(), app.test_request_context(): if page_type == Page.RESULTS: return search_results(data) if page_type == Page.ITEM_PAGE: return item_page(SESSION_ID, asin, KEYWORDS, 1, options) if page_type == Page.SUB_PAGE: if sub_page_type is not None: return item_sub_page(SESSION_ID, asin, KEYWORDS, 1, sub_page_type, options) else: raise Exception("Sub page of type", sub_page_type, "unrecognized") ================================================ FILE: envs/webshop/src/webshop/web_agent_site/__init__.py ================================================ ================================================ FILE: envs/webshop/src/webshop/web_agent_site/app.py ================================================ import argparse, json, logging, random from pathlib import Path from ast import literal_eval from flask import ( Flask, request, redirect, url_for ) from rich import print from .engine.engine import ( load_products, init_search_engine, convert_web_app_string_to_var, get_top_n_product_from_keywords, get_product_per_page, map_action_to_html, END_BUTTON ) from .engine.goal import get_reward, get_goals from .utils import ( generate_mturk_code, setup_logger, DEFAULT_FILE_PATH, DEBUG_PROD_SIZE, ) app = Flask(__name__) search_engine = None all_products = None product_item_dict = None product_prices = None attribute_to_asins = None goals = None weights = None user_sessions = dict() user_log_dir = None SHOW_ATTRS_TAB = False @app.route('/') def home(): return redirect(url_for('index', session_id="abc")) @app.route('/': query = ' '.join(keywords[1:]).strip() top_n_products = [p for p in all_products if p['query'] == query] else: keywords = ' '.join(keywords) hits = search_engine.search(keywords, k=SEARCH_RETURN_N) docs = [search_engine.doc(hit.docid) for hit in hits] top_n_asins = [json.loads(doc.raw())['id'] for doc in docs] top_n_products = [product_item_dict[asin] for asin in 
top_n_asins if asin in product_item_dict] return top_n_products def get_product_per_page(top_n_products, page): return top_n_products[(page - 1) * PRODUCT_WINDOW:page * PRODUCT_WINDOW] def generate_product_prices(all_products): product_prices = dict() for product in all_products: asin = product['asin'] pricing = product['pricing'] if not pricing: price = 100.0 elif len(pricing) == 1: price = pricing[0] else: price = random.uniform(*pricing[:2]) product_prices[asin] = price return product_prices def init_search_engine(num_products=None): if num_products == 100: indexes = 'indexes_100' elif num_products == 1000: indexes = 'indexes_1k' elif num_products == 100000: indexes = 'indexes_100k' elif num_products is None: indexes = 'indexes' else: raise NotImplementedError(f'num_products being {num_products} is not supported yet.') index_dir = os.path.abspath(os.path.join(BASE_DIR, f'../search_index/{indexes}')) assert os.path.isdir(index_dir), f'Index dir missing: {index_dir}' search_engine = LuceneSearcher(index_dir) # search_engine = LuceneSearcher(os.path.join(BASE_DIR, f'../search_index/indexes')) return search_engine def clean_product_keys(products, quiet: bool = False): for product in products: product.pop('product_information', None) product.pop('brand', None) product.pop('brand_url', None) product.pop('list_price', None) product.pop('availability_quantity', None) product.pop('availability_status', None) product.pop('total_reviews', None) product.pop('total_answered_questions', None) product.pop('seller_id', None) product.pop('seller_name', None) product.pop('fulfilled_by_amazon', None) product.pop('fast_track_message', None) product.pop('aplus_present', None) product.pop('small_description_old', None) if not quiet: print('Keys cleaned.') return products def load_products(filepath, num_products=None, human_goals=True, quiet: bool = False): # TODO: move to preprocessing step -> enforce single source of truth with open(filepath) as f: products = json.load(f) if not 
quiet: print('Products loaded.') products = clean_product_keys(products, quiet=quiet) # with open(DEFAULT_REVIEW_PATH) as f: # reviews = json.load(f) all_reviews = dict() all_ratings = dict() # for r in reviews: # all_reviews[r['asin']] = r['reviews'] # all_ratings[r['asin']] = r['average_rating'] if human_goals: with open(HUMAN_ATTR_PATH) as f: human_attributes = json.load(f) with open(DEFAULT_ATTR_PATH) as f: attributes = json.load(f) with open(HUMAN_ATTR_PATH) as f: human_attributes = json.load(f) if not quiet: print('Attributes loaded.') asins = set() all_products = [] attribute_to_asins = defaultdict(set) if num_products is not None: # using item_shuffle.json, we assume products already shuffled products = products[:num_products] for i, p in tqdm(enumerate(products), total=len(products), disable=quiet): asin = p['asin'] if asin == 'nan' or len(asin) > 10: continue if asin in asins: continue else: asins.add(asin) products[i]['category'] = p['category'] products[i]['query'] = p['query'] products[i]['product_category'] = p['product_category'] products[i]['Title'] = p['name'] products[i]['Description'] = p['full_description'] products[i]['Reviews'] = all_reviews.get(asin, []) products[i]['Rating'] = all_ratings.get(asin, 'N.A.') for r in products[i]['Reviews']: if 'score' not in r: r['score'] = r.pop('stars') if 'review' not in r: r['body'] = '' else: r['body'] = r.pop('review') products[i]['BulletPoints'] = p['small_description'] \ if isinstance(p['small_description'], list) else [p['small_description']] pricing = p.get('pricing') if pricing is None or not pricing: pricing = [100.0] price_tag = '$100.0' else: pricing = [ float(Decimal(re.sub(r'[^\d.]', '', price))) for price in pricing.split('$')[1:] ] if len(pricing) == 1: price_tag = f"${pricing[0]}" else: price_tag = f"${pricing[0]} to ${pricing[1]}" pricing = pricing[:2] products[i]['pricing'] = pricing products[i]['Price'] = price_tag options = dict() customization_options = p['customization_options'] 
option_to_image = dict() if customization_options: for option_name, option_contents in customization_options.items(): if option_contents is None: continue option_name = option_name.lower() option_values = [] for option_content in option_contents: option_value = option_content['value'].strip().replace('/', ' | ').lower() option_image = option_content.get('image', None) option_values.append(option_value) option_to_image[option_value] = option_image options[option_name] = option_values products[i]['options'] = options products[i]['option_to_image'] = option_to_image # without color, size, price, availability # if asin in attributes and 'attributes' in attributes[asin]: # products[i]['Attributes'] = attributes[asin]['attributes'] # else: # products[i]['Attributes'] = ['DUMMY_ATTR'] # products[i]['instruction_text'] = \ # attributes[asin].get('instruction', None) # products[i]['instruction_attributes'] = \ # attributes[asin].get('instruction_attributes', None) # without color, size, price, availability if asin in attributes and 'attributes' in attributes[asin]: products[i]['Attributes'] = attributes[asin]['attributes'] else: products[i]['Attributes'] = ['DUMMY_ATTR'] if human_goals: if asin in human_attributes: products[i]['instructions'] = human_attributes[asin] else: products[i]['instruction_text'] = \ attributes[asin].get('instruction', None) products[i]['instruction_attributes'] = \ attributes[asin].get('instruction_attributes', None) products[i]['MainImage'] = p['images'][0] products[i]['query'] = p['query'].lower().strip() all_products.append(products[i]) for p in all_products: for a in p['Attributes']: attribute_to_asins[a].add(p['asin']) product_item_dict = {p['asin']: p for p in all_products} product_prices = generate_product_prices(all_products) return all_products, product_item_dict, product_prices, attribute_to_asins ================================================ FILE: envs/webshop/src/webshop/web_agent_site/engine/goal.py 
================================================ """ Functions for specifying goals and reward calculations. """ import itertools import random import spacy from collections import defaultdict from rich import print from thefuzz import fuzz from .normalize import normalize_color nlp = spacy.load("en_core_web_lg") PRICE_RANGE = [10.0 * i for i in range(1, 100)] def get_goals(all_products, product_prices, human_goals=True, quiet: bool = False): if human_goals: return get_human_goals(all_products, product_prices, quiet=quiet) else: return get_synthetic_goals(all_products, product_prices, quiet=quiet) def get_human_goals(all_products, product_prices, quiet: bool = False): goals = [] cnt_atts = defaultdict(int) cnt = 0 for item in all_products: asin = item['asin'] if 'instructions' not in item: continue for product in item['instructions']: attributes = product['instruction_attributes'] if len(attributes) == 0: cnt += 1 continue if product_prices is not None: price = product_prices[asin] price_range = [p for p in PRICE_RANGE if p > price][:4] if len(price_range) >= 2: _, price_upper = sorted(random.sample(price_range, 2)) price_text = \ f', and price lower than {price_upper:.2f} dollars' else: price_upper = 1000000 price_text = '' else: price_upper = 1000000 goals.append({ 'asin': asin, 'category': item['category'], 'query': item['query'], 'name': item['name'], 'product_category': item['product_category'], 'instruction_text': product['instruction'].strip('.') + price_text, 'attributes': attributes, 'price_upper': price_upper, 'goal_options': product['instruction_options'], }) for att in attributes: cnt_atts[att] += 1 # goals += product_goals for goal in goals: goal['weight'] = 1 if not quiet: print(len(all_products)) print("Number of Goals:", len(goals)) print(cnt, 'skipped') return goals def get_synthetic_goals(all_products, product_prices, quiet: bool = False): goals = [] cnt_atts = defaultdict(int) for product in all_products: if ('instruction_text' not in product or 
product['instruction_text'] is None): continue product_goals = [] asin = product['asin'] attributes = product['instruction_attributes'] assert len(attributes) > 0 if product_prices is not None: price = product_prices[asin] price_range = [p for p in PRICE_RANGE if p > price][:4] if len(price_range) >= 2: _, price_upper = sorted(random.sample(price_range, 2)) price_text = \ f', and price lower than {price_upper:.2f} dollars' else: price_upper = 1000000 price_text = '' else: price_upper = 1000000 price_text = '' instruction_text = product['instruction_text'] options = product['options'] option_names = sorted(options) combinations = list(itertools.product( *(options[option_name] for option_name in option_names) )) for combination in combinations: goal_options = dict() for i, o in enumerate(combination): # option_text.append(f'{option_names[i]}: {o}') goal_options[option_names[i]] = o option_text = ', and '.join([ f'{k}: {v}' for k, v in goal_options.items() ]) option_text = ' with ' + option_text if option_text else '' product_goals.append({ 'asin': asin, 'category': product['category'], 'query': product['query'], 'name': product['name'], 'product_category': product['product_category'], 'instruction_text': f'{instruction_text}{option_text}{price_text}', 'attributes': attributes, 'price_upper': price_upper, 'goal_options': goal_options, 'name': product['Title'], }) for att in attributes: cnt_atts[att] += 1 goals += product_goals for goal in goals: goal['weight'] = sum(1. 
/ cnt_atts[att] for att in goal['attributes']) / len(goal['attributes']) return goals def get_type_reward(purchased_product, goal): """Determines the type reward - captures whether chosen product is in the same category""" query_match = purchased_product['query'] == goal['query'] # Check number of unique categories that match, ignoring order purchased_product_category = [x.strip() for x in purchased_product['product_category'].split('›')] goal_product_category = [x.strip() for x in goal['product_category'].split('›')] category_match = len(set(purchased_product_category) & set(goal_product_category)) >= 2 # Determine whether types align based on product name similarity purchased_type = purchased_product['name'] desired_type = goal['name'] purchased_type_parse = nlp(purchased_type) desired_type_parse = nlp(desired_type) purchased_type_parse = [t.text.lower() for t in purchased_type_parse if t.pos_ in ('PNOUN', 'NOUN', 'PROPN')] desired_type_parse = [t.text.lower() for t in desired_type_parse if t.pos_ in ('PNOUN', 'NOUN', 'PROPN')] n_intersect_type = len( set(purchased_type_parse) & set(desired_type_parse) ) if len(desired_type_parse) == 0: title_score = 0.2 else: title_score = n_intersect_type / len(desired_type_parse) r_type = 1.0 # Adjust r_type score based on query, category title matching/scores match = query_match or category_match or title_score > 0.2 if not match: r_type = 0.5 if title_score < 0.1: r_type = 0.1 if title_score == 0.0: r_type = 0.0 return dict( r_type=r_type, query_match=query_match, category_match=category_match, title_score=title_score, ) def get_attribute_reward(purchased_product, goal): """Determines whether purchased products shares same attributes as goal""" purchased_attrs = purchased_product['Attributes'] goal_attrs = goal['attributes'] num_attr_matches = 0 for g_attr in goal_attrs: matched = False # Check whether goal attribute found in purchased product attribute list for p_attr in purchased_attrs: score = fuzz.token_set_ratio(p_attr, 
g_attr) if score > 85: num_attr_matches += 1 matched = True break # If not in purchased attrs, check Title, Bullet Points (Features), Desc if ( not matched and ( g_attr in purchased_product['Title'].lower() or g_attr in ' '.join(purchased_product['BulletPoints']).lower() or g_attr in purchased_product['Description'].lower() ) ): num_attr_matches += 1 matched = True r_attr = num_attr_matches / len(goal_attrs) return r_attr, num_attr_matches def get_option_reward(purchased_options, goal_options): """Calculate reward for purchased product's options w.r.t. goal options""" purchased_options = [normalize_color(o) for o in purchased_options] goal_options = [normalize_color(o) for o in goal_options] # Perform fuzzy matching of each purchased option against each goal option num_option_matches = 0 for g_option in goal_options: for p_option in purchased_options: score = fuzz.token_set_ratio(p_option, g_option) if score > 85: num_option_matches += 1 break # Calculate option reward as fraction of goal options hit r_option = num_option_matches / len(goal_options) if len(goal_options) > 0 else None return r_option, num_option_matches def get_reward(purchased_product, goal, price, options, **kwargs): """Get cumulative reward score for purchased product and goal""" r_type_dict = get_type_reward(purchased_product, goal) r_price = ( price <= goal['price_upper'] ) if goal['price_upper'] > 0 else None r_att, num_attr_matches = get_attribute_reward(purchased_product, goal) r_option, num_option_matches = get_option_reward( list(options.values()), goal['goal_options'].items() if isinstance(goal['goal_options'], dict) else goal['goal_options'] ) total_reward = ( (num_attr_matches + num_option_matches + r_price) \ / (len(goal['attributes']) + len(goal['goal_options']) + 1) ) total_reward *= r_type_dict['r_type'] # If verbose flag enabled, store score sub-components into dictionary if kwargs.get('verbose', False): info = { 'r_type': r_type_dict['r_type'], 'r_att': r_att, 'w_att': 
len(goal['attributes']) / (len(goal['attributes']) + len(goal['goal_options']) + 1), 'query_match': r_type_dict['query_match'], 'category_match': r_type_dict['category_match'], 'title_score': r_type_dict['title_score'], } if r_option is not None: info['r_option'] = r_option info['w_option'] = len(goal['goal_options']) / (len(goal['attributes']) + len(goal['goal_options']) + 1) if r_price is not None: info['r_price'] = r_price info['w_price'] = 1 / (len(goal['attributes']) + len(goal['goal_options']) + 1) return total_reward, info return total_reward ================================================ FILE: envs/webshop/src/webshop/web_agent_site/engine/normalize.py ================================================ import re from typing import Tuple COLOR_SET = [ 'alabaster', 'apricot', 'aqua', 'ash', 'asphalt', 'azure', 'banana', 'beige', 'black', 'blue', 'blush', 'bordeaux', 'bronze', 'brown', 'burgundy', 'camel', 'camo', 'caramel', 'champagne', 'charcoal', 'cheetah', 'chestnut', 'chocolate', 'christmas', 'coffee', 'cognac', 'copper', 'coral', 'cranberry', 'cream', 'crystal', 'dark', 'denim', 'eggplant', 'elephant', 'espresso', 'fuchsia', 'gold', 'granite', 'grape', 'graphite', 'grass', 'gray', 'green', 'grey', 'heather', 'indigo', 'ivory', 'ivy', 'khaki', 'lavender', 'lemon', 'leopard', 'light', 'lilac', 'lime', 'magenta', 'maroon', 'mauve', 'merlot', 'midnight', 'mint', 'mocha', 'multicolor', 'mushroom', 'mustard', 'natural', 'navy', 'nude', 'olive', 'orange', 'peach', 'pewter', 'pink', 'plum', 'purple', 'rainbow', 'red', 'rose', 'royal', 'rust', 'sand', 'sapphire', 'seashell', 'silver', 'skull', 'slate', 'steel', 'stone', 'stonewash', 'sunflower', 'tan', 'taupe', 'teal', 'tiger', 'turquoise', 'violet', 'walnut', 'wheat', 'white', 'wine', 'yellow', ] SIZE_SET = [ 'xx-large', '3x-large', '4x-large', '5x-large', 'x-large', 'x-small', 'medium', 'large', 'small', 'queen', 'twin', 'full', 'king', 'one size', 'pack', ] SIZE_PATTERNS = [ re.compile(r'(.*)neck(.*)sleeve'), 
re.compile(r'(.*) women \| (.*) men'), re.compile(r'(.*)w x(.*)l'), re.compile(r'(.*)w by (.*)l'), re.compile(r'(.*)w x(.*)h'), re.compile(r'(.*)wide'), re.compile(r'(.*)x-wide'), re.compile(r'(.*)narrow'), re.compile(r'(.*)petite'), re.compile(r'(.*)inch'), re.compile(r'(.*)plus'), re.compile(r'(.*)mm'), re.compile(r'women(.*)'), re.compile(r'(.*)x(.*)'), re.compile(r'(.*)ft'), re.compile(r'(.*)feet'), re.compile(r'(.*)meter'), re.compile(r'(.*)yards'), re.compile(r'(.*)\*(.*)'), re.compile(r'(.*)\-(.*)'), re.compile(r'(\d+)"$'), re.compile(r'(\d+)f$'), re.compile(r'(\d+)m$'), re.compile(r'(\d+)cm$'), re.compile(r'(\d+)g$'), ] SIZE_PATTERNS = [re.compile(s) for s in SIZE_SET] + SIZE_PATTERNS def normalize_color(color_string: str) -> str: """Extracts the first color found if exists""" for norm_color in COLOR_SET: if norm_color in color_string: return norm_color return color_string def normalize_color_size(product_prices: dict) -> Tuple[dict, dict]: """Get mappings of all colors, sizes to corresponding values in COLOR_SET, SIZE_PATTERNS""" # Get all colors, sizes from list of all products all_colors, all_sizes = set(), set() for (_, color, size), _ in product_prices.items(): all_colors.add(color.lower()) all_sizes.add(size.lower()) # Create mapping of each original color value to corresponding set value color_mapping = {'N.A.': 'not_matched'} for c in all_colors: matched = False for base in COLOR_SET: if base in c: color_mapping[c] = base matched = True break if not matched: color_mapping[c] = 'not_matched' # Create mapping of each original size value to corresponding set value size_mapping = {'N.A.': 'not_matched'} for s in all_sizes: matched = False for pattern in SIZE_PATTERNS: m = re.search(pattern, s) if m is not None: matched = True size_mapping[s] = pattern.pattern break if not matched: if s.replace('.', '', 1).isdigit(): size_mapping[s] = 'numeric_size' matched= True if not matched: size_mapping[s] = 'not_matched' return color_mapping, size_mapping 
================================================ FILE: envs/webshop/src/webshop/web_agent_site/envs/__init__.py ================================================ from gym.envs.registration import register from envs.webshop.src.webshop.web_agent_site.envs.web_agent_site_env import WebAgentSiteEnv from envs.webshop.src.webshop.web_agent_site.envs.web_agent_text_env import WebAgentTextEnv register( id='WebAgentSiteEnv-v0', entry_point='envs.webshop.src.webshop.web_agent_site.envs:WebAgentSiteEnv', ) register( id='WebAgentTextEnv-v0', entry_point='envs.webshop.src.webshop.web_agent_site.envs:WebAgentTextEnv', ) ================================================ FILE: envs/webshop/src/webshop/web_agent_site/envs/chromedriver ================================================ [File too large to display: 15.9 MB] ================================================ FILE: envs/webshop/src/webshop/web_agent_site/envs/web_agent_site_env.py ================================================ import gym import random import requests import string import time from bs4 import BeautifulSoup from bs4.element import Comment from gym import spaces from os.path import join, dirname, abspath from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.keys import Keys from selenium.common.exceptions import ElementNotInteractableException from ..engine.engine import parse_action, END_BUTTON class WebAgentSiteEnv(gym.Env): """Gym environment for HTML mode of WebShop environment""" def __init__(self, observation_mode='html', **kwargs): """ Constructor for HTML environment Arguments: observation_mode (`str`) -- ['html' | 'text'] (default 'html') pause (`float`) -- Pause (in seconds) after taking an action. This is mainly for demo purposes. Recommended value: 2.0s render (`bool`) -- Show browser if set to `True`. 
session ('str') -- Session ID to initialize environment with """ super(WebAgentSiteEnv, self).__init__() self.observation_mode = observation_mode self.kwargs = kwargs # Create a browser driver to simulate the WebShop site service = Service(join(dirname(abspath(__file__)), 'chromedriver')) options = Options() if 'render' not in kwargs or not kwargs['render']: options.add_argument("--headless") # don't show browser self.browser = webdriver.Chrome(service=service, options=options) # Set flags and values for WebShop session self.text_to_clickable = None self.assigned_session = kwargs.get('session') self.session = None self.reset() def step(self, action): """ Takes an action, updates WebShop environment, and returns (observation, reward, done, info) Arguments: action (`str`): An action should be of the following structure: - search[keywords] - click[value] If action not valid, perform nothing. """ reward = 0.0 done = False info = None # Map action to executed command on the WebShop environment via the broswer driver action_name, action_arg = parse_action(action) if action_name == 'search': try: search_bar = self.browser.find_element_by_id('search_input') except Exception: pass else: search_bar.send_keys(action_arg) search_bar.submit() elif action_name == 'click': try: self.text_to_clickable[action_arg].click() except ElementNotInteractableException: # Perform force click with JavaScript button = self.text_to_clickable[action_arg] self.browser.execute_script("arguments[0].click();", button) reward = self.get_reward() if action_arg == END_BUTTON: done = True elif action_name == 'end': done = True else: print('Invalid action. 
No action performed.') if 'pause' in self.kwargs: time.sleep(self.kwargs['pause']) return self.observation, reward, done, info def get_available_actions(self): """Returns list of available actions at the current step""" # Determine if a search bar is available try: search_bar = self.browser.find_element_by_id('search_input') except Exception: has_search_bar = False else: has_search_bar = True # Collect buttons, links, and options as clickables buttons = self.browser.find_elements_by_class_name('btn') product_links = self.browser.find_elements_by_class_name('product-link') buying_options = self.browser.find_elements_by_css_selector("input[type='radio']") self.text_to_clickable = { f'{b.text}': b for b in buttons + product_links } for opt in buying_options: opt_value = opt.get_attribute('value') self.text_to_clickable[f'{opt_value}'] = opt return dict( has_search_bar=has_search_bar, clickables=list(self.text_to_clickable.keys()), ) def _parse_html(self, html=None, url=None): """ Returns web request result wrapped in BeautifulSoup object Arguments: url (`str`): If no url or html is provided, use the current observation (HTML) for parsing. 
""" if html is None: if url is not None: html = requests.get(url) else: html = self.state['html'] html_obj = BeautifulSoup(html, 'html.parser') return html_obj def get_reward(self): """Get reward value at current step of the environment""" html_obj = self._parse_html() r = html_obj.find(id='reward') r = float(r.findChildren("pre")[0].string) if r is not None else 0.0 return r def get_instruction_text(self): """Get corresponding instruction text for environment current step""" html_obj = self._parse_html(self.browser.page_source) instruction_text = html_obj.find(id='instruction-text').h4.text return instruction_text def convert_html_to_text(self, html): """Strip HTML of tags and add separators to convert observation into simple mode""" texts = self._parse_html(html).find_all(string=True) visible_texts = filter(tag_visible, texts) observation = ' [SEP] '.join(t.strip() for t in visible_texts if t != '\n') return observation @property def state(self): """ State that includes all information. The actual observation are likely to be a subset or reduced form of the state. """ return dict( url=self.browser.current_url, html=self.browser.page_source, instruction_text=self.instruction_text, ) @property def observation(self): """Compiles state into either the `html` or `text` observation mode""" html = self.state['html'] if self.observation_mode == 'html': return html elif self.observation_mode == 'text': return self.convert_html_to_text(html) else: raise ValueError( f'Observation mode {self.observation_mode} not supported.' 
) @property def action_space(self): # Recommended to use `get_available_actions` instead return NotImplementedError @property def observation_space(self): return NotImplementedError def reset(self): """Create a new session and reset environment variables""" if self.assigned_session is not None: self.session = self.assigned_session else: self.session = ''.join(random.choices(string.ascii_lowercase, k=5)) init_url = f'http://127.0.0.1:3000/{self.session}' self.browser.get(init_url) self.instruction_text = self.get_instruction_text() return self.observation, None def render(self, mode='human'): # TODO: Render observation in terminal or WebShop website return NotImplementedError def close(self): # TODO: When DB used instead of JSONs, tear down DB here self.browser.close() print('Browser closed.') def tag_visible(element): """Helper method to strip HTML block of extraneous tags""" ignore = {'style', 'script', 'head', 'title', 'meta', '[document]'} return ( element.parent.name not in ignore and not isinstance(element, Comment) ) ================================================ FILE: envs/webshop/src/webshop/web_agent_site/envs/web_agent_text_env.py ================================================ import os import gym import json import random import string import time import torch import pickle from bs4 import BeautifulSoup from bs4.element import Comment from collections import defaultdict from flask import Flask from ..engine.engine import ( load_products, init_search_engine, get_top_n_product_from_keywords, map_action_to_html, parse_action, get_product_per_page, ACTION_TO_TEMPLATE, END_BUTTON, NEXT_PAGE, PREV_PAGE, BACK_TO_SEARCH, ) from ..engine.goal import get_reward, get_goals from ..utils import ( DEFAULT_FILE_PATH, FEAT_CONV, FEAT_IDS, random_idx ) app = Flask(__name__) class WebAgentTextEnv(gym.Env): """Gym environment for Text mode of WebShop environment""" def __init__( self, observation_mode='html', file_path=DEFAULT_FILE_PATH, server=None, **kwargs ): """ 
Constructor for text environment Arguments: observation_mode (`str`) -- ['html' | 'text'] (default 'html') get_image filter_goals limit_goals num_products human_goals session session_prefix show_attrs """ super(WebAgentTextEnv, self).__init__() self.observation_mode = observation_mode self.kwargs = kwargs self.file_path = file_path self.base_url = 'http://127.0.0.1:3000' self.server = SimServer( self.base_url, self.file_path, self.kwargs.get('filter_goals'), self.kwargs.get('limit_goals', -1), self.kwargs.get('num_products'), self.kwargs.get('human_goals'), self.kwargs.get('show_attrs', False), self.kwargs.get('quiet', False), ) if server is None else server self.browser = SimBrowser(self.server) self.session = self.kwargs.get('session') self.session_prefix = self.kwargs.get('session_prefix') if self.kwargs.get('get_image', 0): self.feats = torch.load(FEAT_CONV) self.ids = torch.load(FEAT_IDS) self.ids = {url: idx for idx, url in enumerate(self.ids)} self.prev_obs = [] self.prev_actions = [] self.num_prev_obs = self.kwargs.get('num_prev_obs', 0) self.num_prev_actions = self.kwargs.get('num_prev_actions', 0) self.reset() def step(self, action): """ Takes an action, updates WebShop environment, and returns (observation, reward, done, info) Arguments: action (`str`): An action should be of the following structure: - search[keywords] - click[value] If action not valid, perform nothing. 
""" info = None self.get_available_actions() # Determine action type (click, search) and argument action_name, action_arg = parse_action(action) action_name = action_name.lower() if action_arg is not None: action_arg = action_arg.lower() if (action_name == 'search' and action_arg is not None and action_arg != ''): status = self.browser.search(action_arg) elif (action_name == 'click' and action_arg in self.text_to_clickable.keys() and action_arg != 'search'): status = self.browser.click(action_arg, self.text_to_clickable) else: status = dict(reward=0, done=False) # Update observation, state with the new action ob = self.observation text_list = [ob] self.prev_actions.append(action) for i in range(1, 1 + max(self.num_prev_obs, self.num_prev_actions)): if len(self.prev_actions) >= i and self.num_prev_actions >= i: text_list.append(self.prev_actions[-i]) if len(self.prev_obs) >= i and self.num_prev_obs >= i: text_list.append(self.prev_obs[-i]) state = ' [SEP] '.join(text_list[::-1]) self.prev_obs.append(ob) return state, status['reward'], status['done'], info def get_available_actions(self): """Returns list of available actions at the current step""" html_obj = self._parse_html() # Collect search bar, buttons, links, and options as clickables search_bar = html_obj.find(id='search_input') has_search_bar = True if search_bar is not None else False buttons = html_obj.find_all(class_='btn') product_links = html_obj.find_all(class_='product-link') buying_options = html_obj.select('input[type="radio"]') self.text_to_clickable = { f'{b.get_text()}'.lower(): b for b in buttons + product_links } for opt in buying_options: opt_value = opt.get('value') self.text_to_clickable[f'{opt_value}'] = opt return dict( has_search_bar=has_search_bar, clickables=list(self.text_to_clickable.keys()), ) def get_image(self): """Scrape image from page HTML and return as a list of pixel values""" html_obj = self._parse_html(self.browser.page_source) image_url = html_obj.find(id='product-image') if 
image_url is not None: image_url = image_url['src'] if image_url in self.ids: image_idx = self.ids[image_url] image = self.feats[image_idx] return image return torch.zeros(512) def get_instruction_text(self): """Get corresponding instruction text for current environment session""" html_obj = self._parse_html(self.browser.page_source) instruction_text = html_obj.find(id='instruction-text').h4.text return instruction_text def _parse_html(self, html=None): """ Returns web request result wrapped in BeautifulSoup object Arguments: url (`str`): If no url or html is provided, use the current observation (HTML) for parsing. """ if html is None: html = self.state['html'] html_obj = BeautifulSoup(html, 'html.parser') return html_obj @property def observation(self): """Compiles state into either the `html` or `text` observation mode""" html = self.state['html'] if self.observation_mode == 'html': return html elif self.observation_mode == 'text': return self.convert_html_to_text(html, simple=True) elif self.observation_mode == 'text_rich': return self.convert_html_to_text(html, simple=False) elif self.observation_mode == 'url': return self.state['url'] else: raise ValueError( f'Observation mode {self.observation_mode} not supported.' ) @property def state(self): """ State that includes all information. The actual observation are likely to be a subset or reduced form of the state. 
""" return dict( url=self.browser.current_url, html=self.browser.page_source, instruction_text=self.instruction_text, ) def convert_html_to_text(self, html, simple=False): """Strip HTML of tags and add separators to convert observation into simple mode""" texts = self._parse_html(html).find_all(string=True) visible_texts = filter(tag_visible, texts) if simple: # For `simple` mode, return just [SEP] separators return ' [SEP] '.join(t.strip() for t in visible_texts if t != '\n') else: # Otherwise, return an observation with tags mapped to specific, unique separators observation = '' for t in visible_texts: if t == '\n': continue if t.parent.name == 'button': # button processed_t = f'[button] {t} [button_]' elif t.parent.name == 'label': # options if f'"{t}"' in self.state['url']: processed_t = f' [clicked button] {t} [clicked button_]' observation = f'You have clicked {t}.\n' + observation else: processed_t = f' [button] {t} [button_]' elif t.parent.get('class') == ["product-link"]: # product asins if f'{t}' in self.server.user_sessions[self.session]['asins']: processed_t = f'\n[clicked button] {t} [clicked button_]' else: processed_t = f'\n[button] {t} [button_]' else: # regular, unclickable text processed_t = str(t) observation += processed_t + '\n' return observation def reset(self, session=None, instruction_text=None): """Create a new session and reset environment variables""" session_int = None if session is not None: self.session = str(session) if isinstance(session, int): session_int = session else: self.session = ''.join(random.choices(string.ascii_lowercase, k=10)) if self.session_prefix is not None: self.session = self.session_prefix + self.session init_url = f'{self.base_url}/{self.session}' self.browser.get(init_url, session_id=self.session, session_int=session_int) self.text_to_clickable = None self.instruction_text = self.get_instruction_text() if instruction_text is None else instruction_text obs = self.observation self.prev_obs = [obs] 
self.prev_actions = [] return obs, None def render(self, mode='human'): pass def close(self): pass def tag_visible(element): ignore = {'style', 'script', 'head', 'title', 'meta', '[document]'} return ( element.parent.name not in ignore and not isinstance(element, Comment) ) class SimServer: """Lightweight simulator of WebShop Flask application for generating HTML observations""" def __init__( self, base_url, file_path, filter_goals=None, limit_goals=-1, num_products=None, human_goals=0, show_attrs=False, quiet=False, ): """ Constructor for simulated server serving WebShop application Arguments: filter_goals (`func`) -- Select specific goal(s) for consideration based on criteria of custom function limit_goals (`int`) -- Limit to number of goals available num_products (`int`) -- Number of products to search across human_goals (`bool`) -- If true, load human goals; otherwise, load synthetic goals """ # Load all products, goals, and search engine self.base_url = base_url self.quiet = bool(quiet) # cache_path = os.path.join(os.getcwd(), '.cache') # if os.path.exists(cache_path): # self.all_products = pickle.load(open(os.path.join(cache_path, 'all_products.pkl'), 'rb')) # self.product_item_dict = pickle.load(open(os.path.join(cache_path, 'product_item_dict.pkl'), 'rb')) # self.product_prices = pickle.load(open(os.path.join(cache_path, 'product_prices.pkl'), 'rb')) # self.goals = pickle.load(open(os.path.join(cache_path, 'goals.pkl'), 'rb')) # else: # self.all_products, self.product_item_dict, self.product_prices, _ = \ # load_products(filepath=file_path, num_products=num_products, human_goals=human_goals) # self.goals = get_goals(self.all_products, self.product_prices, human_goals) # os.mkdir(cache_path) # pickle.dump(self.all_products, open(os.path.join(cache_path, 'all_products.pkl'), 'wb')) # pickle.dump(self.product_item_dict, open(os.path.join(cache_path, 'product_item_dict.pkl'), 'wb')) # pickle.dump(self.product_prices, open(os.path.join(cache_path, 
'product_prices.pkl'), 'wb'))
        # pickle.dump(self.goals, open(os.path.join(cache_path, 'goals.pkl'), 'wb'))
        # self.search_engine = init_search_engine(num_products=num_products)
        self.all_products, self.product_item_dict, self.product_prices, _ = \
            load_products(
                filepath=file_path,
                num_products=num_products,
                human_goals=human_goals,
                quiet=self.quiet
            )
        self.search_engine = init_search_engine(num_products=num_products)
        self.goals = get_goals(self.all_products, self.product_prices, human_goals, quiet=self.quiet)
        self.show_attrs = show_attrs

        # Fix outcome for random shuffling of goals
        random.seed(233)
        random.shuffle(self.goals)

        # Apply `filter_goals` parameter if exists to select specific goal(s)
        if filter_goals is not None:
            self.goals = [
                goal for (i, goal) in enumerate(self.goals)
                if filter_goals(i, goal)
            ]

        # Imposes `limit` on goals via weighted random selection (no repeats)
        if limit_goals != -1 and limit_goals < len(self.goals):
            self.weights = [goal['weight'] for goal in self.goals]
            self.cum_weights = [0]
            for w in self.weights:
                self.cum_weights.append(self.cum_weights[-1] + w)
            idxs = []
            while len(idxs) < limit_goals:
                idx = random_idx(self.cum_weights)
                if idx not in idxs:
                    idxs.append(idx)
            self.goals = [self.goals[i] for i in idxs]
        if not self.quiet:
            print(f'Loaded {len(self.goals)} goals.')
        # pickle.dump(self.goals, open(os.path.join(cache_path, 'goals_final.pkl'), 'wb'))

        # Set extraneous housekeeping variables
        # (cum_weights is rebuilt over the final goal list; used by `receive`
        # when sampling a goal for a brand-new session)
        self.weights = [goal['weight'] for goal in self.goals]
        self.cum_weights = [0]
        for w in self.weights:
            self.cum_weights.append(self.cum_weights[-1] + w)
        self.user_sessions = dict()
        self.search_time = 0
        self.render_time = 0
        self.sample_time = 0
        self.assigned_instruction_text = None  # TODO: very hacky, should remove

    @app.route('/', methods=['GET', 'POST'])
    def index(self, session_id, **kwargs):
        """Redirect to the search page with the given session ID"""
        html = map_action_to_html(
            'start',
            session_id=session_id,
            instruction_text=kwargs['instruction_text'],
        )
        url = f'{self.base_url}/{session_id}'
        return html, url

    @app.route('/', methods=['GET', 'POST'])
    def search_results(self, session_id, **kwargs):
        """Initialize session and return the search results page"""
        session = self.user_sessions[session_id]
        keywords = kwargs['keywords']  # TODO: why is this using kwargs? why not session?
        assert isinstance(keywords, list)
        page = 1 if 'page' not in kwargs else kwargs['page']
        session["page"] = page
        session["keywords"] = keywords
        session["actions"]["search"] += 1
        session["asin"] = None
        session["options"] = {}

        # Perform search on keywords from items and record amount of time it takes
        old_time = time.time()
        top_n_products = get_top_n_product_from_keywords(
            keywords,
            self.search_engine,
            self.all_products,
            self.product_item_dict,
        )
        self.search_time += time.time() - old_time

        # Get product list from search result asins and get list of corresponding URLs
        products = get_product_per_page(top_n_products, page)
        keywords_url_string = '+'.join(keywords)
        url = (
            f'{self.base_url}/search_results/{session_id}/'
            f'{keywords_url_string}/{page}'
        )

        # Render HTML search page and record amount of time taken
        old_time = time.time()
        html = map_action_to_html(
            'search',
            session_id=session_id,
            products=products,
            keywords=session["keywords"],
            page=page,
            total=len(top_n_products),
            instruction_text=session["goal"]["instruction_text"],
        )
        self.render_time += time.time() - old_time
        return html, url

    @app.route('/', methods=['GET', 'POST'])
    def item_page(self, session_id, **kwargs):
        """Render and return the HTML for a product item page"""
        session = self.user_sessions[session_id]
        clickable_name = kwargs['clickable_name']
        text_to_clickable = kwargs['text_to_clickable']
        clickable = text_to_clickable[clickable_name]

        # Update session logs with information of last product asin selected
        if (clickable.get('class') is not None and
                clickable.get('class')[0] == 'product-link'):
            session["asin"] = clickable_name.upper()
            session["actions"]["asin"] += 1
            session["asins"].add(session["asin"])
        elif clickable.get('name') is not None:
            clickable_key = clickable['name'].lower()
            session["options"][clickable_key] = clickable_name
            session["actions"]["options"] += 1

        # Set fields + url of page, then render page's HTML
        product_info = self.product_item_dict[session["asin"]]
        keywords_url_string = '+'.join(session["keywords"])
        option_string = json.dumps(session['options'])
        url = (
            f'{self.base_url}/item_page/{session_id}/'
            f'{session["asin"]}/{keywords_url_string}/'
            f'{session["page"]}/{option_string}'
        )
        html = map_action_to_html(
            'click',
            session_id=session_id,
            product_info=product_info,
            keywords=session["keywords"],
            page=session["page"],
            asin=session["asin"],
            options=session["options"],
            instruction_text=session["goal"]["instruction_text"],
            show_attrs=self.show_attrs,
        )
        return html, url

    @app.route('/', methods=['GET', 'POST'])
    def item_sub_page(self, session_id, **kwargs):
        """Render and return the HTML for a product's sub page (i.e. description, features)"""
        session = self.user_sessions[session_id]
        clickable_name = kwargs['clickable_name']
        # Canonicalize the clickable name to the template key's casing
        for k in ACTION_TO_TEMPLATE:
            if clickable_name.lower() == k.lower():
                clickable_name = k
                break

        # Set fields + url of page, then render page's HTML
        product_info = self.product_item_dict[session["asin"]]
        session["actions"][clickable_name] += 1
        keywords_url_string = '+'.join(session["keywords"])
        # NOTE(review): unlike item_page, the options dict is interpolated with
        # its Python repr here (no json.dumps) — confirm whether intentional
        url = (
            f'{self.base_url}/item_sub_page/{session_id}/'
            f'{session["asin"]}/{keywords_url_string}/{session["page"]}/'
            f'{clickable_name}/{session["options"]}'
        )
        html = map_action_to_html(
            f'click[{clickable_name}]',
            session_id=session_id,
            product_info=product_info,
            keywords=session["keywords"],
            page=session["page"],
            asin=session["asin"],
            options=session["options"],
            instruction_text=session["goal"]["instruction_text"],
        )
        return html, url

    @app.route('/', methods=['GET', 'POST'])
    def done(self, session_id, **kwargs):
        """Render and return HTML for done page"""
        session = self.user_sessions[session_id]
        goal = self.user_sessions[session_id]['goal']
        purchased_product = self.product_item_dict[session["asin"]]
        session["actions"]["purchase"] += 1
        price = self.product_prices.get(session["asin"])

        # Calculate reward for selected product and set variables for page details
        reward, info = get_reward(
            purchased_product,
            goal,
            price=price,
            options=session["options"],
            verbose=True
        )

        self.user_sessions[session_id]['verbose_info'] = info
        self.user_sessions[session_id]['done'] = True
        self.user_sessions[session_id]['reward'] = reward

        url = (
            f'{self.base_url}/done/{session_id}/'
            f'{session["asin"]}/{session["options"]}'
        )
        html = map_action_to_html(
            f'click[{END_BUTTON}]',
            session_id=session_id,
            reward=reward,
            asin=session["asin"],
            options=session["options"],
            instruction_text=session["goal"]["instruction_text"],
        )
        return html, url, reward

    def receive(self, session_id, current_url, session_int=None, **kwargs):
        """Map action to the corresponding page.

        Dispatches on the contents of **kwargs: no kwargs resets the session,
        'keywords' triggers a search, 'clickable_name' routes to the matching
        click handler. Returns (html, url, status) where status carries
        reward/done flags for the episode.
        """
        status = dict(reward=0.0, done=False)
        with app.app_context(), app.test_request_context():
            # Create/determine goal, instruction_text from current session
            if session_id not in self.user_sessions:
                idx = session_int if (session_int is not None and isinstance(session_int, int)) else random_idx(self.cum_weights)
                goal = self.goals[idx]
                instruction_text = goal['instruction_text']
                self.user_sessions[session_id] = {'goal': goal, 'done': False}
            else:
                instruction_text = \
                    self.user_sessions[session_id]['goal']['instruction_text']
            if self.assigned_instruction_text is not None:
                instruction_text = self.assigned_instruction_text  # TODO: very hacky, should remove
                self.user_sessions[session_id]['goal']['instruction_text'] = instruction_text
            session = self.user_sessions[session_id]

            if not kwargs:
                # If no action, reset the session variables
                kwargs['instruction_text'] = instruction_text
                html, url = self.index(session_id, **kwargs)
                self.user_sessions[session_id].update(
                    {
                        'keywords': None,
                        'page': None,
                        'asin': None,
                        'asins': set(),
                        'options': dict(),
                        'actions': defaultdict(int)
                    }
                )
            elif 'keywords' in kwargs:
                # If search keywords are available, run a search
                html, url = self.search_results(session_id, **kwargs)
            elif 'clickable_name' in kwargs:
                clickable_name = kwargs['clickable_name'].lower()
                if clickable_name == END_BUTTON.lower():
                    # If "buy now" clicked, calculate reward and flag session as terminated
                    html, url, reward = self.done(session_id, **kwargs)
                    status['reward'] = reward
                    status['done'] = True
                elif clickable_name == BACK_TO_SEARCH.lower():
                    # If "back to search" clicked, recursively reset the session back to search page
                    # NOTE(review): the recursive call replaces `status` wholesale
                    html, url, status = self.receive(session_id, current_url)
                elif (clickable_name == NEXT_PAGE.lower() and
                        self.get_page_name(current_url) == 'search_results'):
                    # If "next page" clicked from search results, re-render with `page` enumerated
                    html, url, status = self.receive(
                        session_id,
                        current_url,
                        keywords=session["keywords"],
                        page=session["page"] + 1,
                    )
                elif (clickable_name == PREV_PAGE.lower() and
                        self.get_page_name(current_url) == 'search_results'):
                    # If "prev page" clicked from search results, re-render with `page` denumerated
                    html, url, status = self.receive(
                        session_id,
                        current_url,
                        keywords=session["keywords"],
                        page=session["page"] - 1,
                    )
                elif (clickable_name == PREV_PAGE.lower() and
                        self.get_page_name(current_url) == 'item_sub_page'):
                    # If "prev page" clicked from sub page, return to corresponding item page
                    html, url = self.item_page(session_id, **kwargs)
                elif (clickable_name == PREV_PAGE.lower() and
                        self.get_page_name(current_url) == 'item_page'):
                    # If "prev page" clicked from item page, return to search results page
                    html, url = self.search_results(
                        session_id,
                        keywords=session["keywords"],
                        page=session["page"],
                        **kwargs
                    )
                elif clickable_name in [k.lower() for k in ACTION_TO_TEMPLATE]:
                    # Render item_sub_page if clickable is description, features, or reviews
                    html, url = self.item_sub_page(session_id, **kwargs)
                else:
                    # Otherwise, render current item page
                    html, url = self.item_page(session_id, **kwargs)
            return html, url, status

    def get_page_name(self, url):
        """Determine which page (i.e. item_page, search_results) the given URL is pointing at"""
        if url is None:
            return None
        page_names = [
            'search_results',
            'item_page',
            'item_sub_page',
            'done'
        ]
        for page_name in page_names:
            if page_name in url:
                return page_name
        return ''  # index page


class SimBrowser:
    """Simulated browser for rendering the HTML source of WebShop environment pages"""

    def __init__(self, server):
        self.server = server
        self.current_url = None
        self.page_source = None
        self.session_id = None

    def get(self, url, session_id=None, session_int=None):
        """Set browser variables to corresponding link, page HTML for URL"""
        self.session_id = url.split('/')[-1] if session_id is None else session_id
        self.page_source, _, _ = \
            self.server.receive(self.session_id, self.current_url, session_int=session_int)
        self.current_url = url

    def click(self, clickable_name, text_to_clickable):
        """Wrapper for `receive` handler for performing click action on current page"""
        self.page_source, self.current_url, status = \
            self.server.receive(
                self.session_id,
                current_url=self.current_url,
                clickable_name=clickable_name,
                text_to_clickable=text_to_clickable,
            )
        return status

    def search(self, keywords):
        """Wrapper for `receive` handler for performing search action on current page"""
        if isinstance(keywords, str):
            keywords = keywords.split(' ')
        self.page_source, self.current_url, status = \
            self.server.receive(
                self.session_id,
                current_url=self.current_url,
                keywords=keywords,
            )
        return status
================================================
FILE: envs/webshop/src/webshop/web_agent_site/models/__init__.py
================================================
from .models import (
    HumanPolicy,
    RandomPolicy,
)
================================================
FILE: envs/webshop/src/webshop/web_agent_site/models/models.py
================================================
"""
Model implementations.
The model interface should be suitable for both the ``site env'' and the ``text env''. """ import random random.seed(4) class BasePolicy: def __init__(self): pass def forward(observation, available_actions): """ Args: observation (`str`): HTML string available_actions (): ... Returns: action (`str`): Return string of the format ``action_name[action_arg]''. Examples: - search[white shoes] - click[button=Reviews] - click[button=Buy Now] """ raise NotImplementedError class HumanPolicy(BasePolicy): def __init__(self): super().__init__() def forward(self, observation, available_actions): action = input('> ') return action class RandomPolicy(BasePolicy): def __init__(self): super().__init__() def forward(self, observation, available_actions): if available_actions['has_search_bar']: action = 'search[shoes]' else: action_arg = random.choice(available_actions['clickables']) action = f'click[{action_arg}]' return action ================================================ FILE: envs/webshop/src/webshop/web_agent_site/static/style.css ================================================ .text { font-family: Arial, Helvetica; } * { -webkit-border-radius: 1px !important; -moz-border-radius: 1px !important; border-radius: 1px !important; } #logo { color: #666; width:100%; } #logo h1 { font-size: 60px; text-shadow: 1px 2px 3px #999; font-family: Roboto, sans-serif; font-weight: 700; letter-spacing: -1px; } #logo p{ padding-bottom: 20px; } #thankyou { color: #666; width:100%; font-size: 60px; font-family: Roboto, sans-serif; font-weight: 700; padding-bottom: 20px; letter-spacing: -1px; } #form-buscar >.form-group >.input-group > .form-control { height: 40px; } #form-buscar >.form-group >.input-group > .input-group-btn > .btn{ height: 40px; font-size: 16px; font-weight: 300; } #form-buscar >.form-group >.input-group > .input-group-btn > .btn .glyphicon{ margin-right:12px; } #form-buscar >.form-group >.input-group > .form-control { font-size: 16px; font-weight: 300; } #form-buscar 
>.form-group >.input-group > .form-control:focus { border-color: #33A444; outline: 0; -webkit-box-shadow: inset 0 1px 1px rgba(0,0,0,.075), 0 0 1px rgba(0, 109, 0, 0.8); box-shadow: inset 0 1px 1px rgba(0,0,0,.075), 0 0 1px rgba(0, 109, 0, 0.8); } body { background: white; min-height: 100vh } .text-gray { color: #aaa } .result-img { max-height: 300px; max-width: 300px; overflow: hidden; } .item-page-img { max-height: 600px; max-width: 370px; overflow: hidden; } .top-buffer { margin-top:10px; } .product-info { font-size: 18px; } .star-active { color: #FBC02D; margin-top: 10px; margin-bottom: 10px } .star-active:hover { color: #F9A825; cursor: pointer } .star-inactive { color: #CFD8DC; margin-top: 10px; margin-bottom: 10px } .blue-text { color: #116396 } .btn { margin-left: 0px; margin-right: 0px; } /* Boostrap Buttons Styling */ .btn-primary { font-size: 13px; color: rgba(58, 133, 191, 0.75); letter-spacing: 1px; line-height: 15px; border: 2px solid rgba(58, 133, 191, 0.75); border-radius: 40px; background: transparent; } .btn-primary:hover { color: #FFF; background: rgba(58, 133, 191, 0.75); } .btn-success { font-size: 13px; color: rgba(103, 192, 103, 0.75); letter-spacing: 1px; line-height: 15px; border: 2px solid rgba(103, 192, 103, 0.75); border-radius: 40px; background: transparent; } .btn-success:hover { color: #FFF; background: rgb(103, 192, 103, 0.75); } .btn.purchase { color: rgb(0, 0, 0); background: rgb(250, 167, 13); } .btn.purchase:hover { color: rgb(0, 0, 0); background: rgb(253, 199, 98); } .radio-toolbar { margin: 5px; } .radio-toolbar input[type="radio"] { opacity: 0; position: fixed; width: 0; } .radio-toolbar label { display: inline-block; background-color: rgb(245, 241, 241); padding: 10px 10px; font-size: 14px; border: 1px solid #444; border-radius: 4px; } .radio-toolbar label:hover { background-color: rgb(255, 247, 217); } .radio-toolbar input[type="radio"]:focus + label { border: 1px solid #444; } .radio-toolbar input[type="radio"]:checked + 
label { background-color: rgb(255, 234, 163); border: 1px solid #444; } #instruction-text { margin-top:10px; margin-bottom:10px; border: #797474 solid; border-radius: 20px; padding: 5px; } pre { white-space: pre-line; } ================================================ FILE: envs/webshop/src/webshop/web_agent_site/templates/attributes_page.html ================================================================================================ FILE: envs/webshop/src/webshop/web_agent_site/templates/description_page.html ================================================Instruction:
{{ instruction_text }}{% for attribute in product_info.Attributes %}
- {% endfor %}
{{attribute}}
{{product_info.category}}
{{product_info.query}}
{{product_info.product_category}}
================================================ FILE: envs/webshop/src/webshop/web_agent_site/templates/done_page.html ================================================Instruction:
{{ instruction_text }}{{product_info.Description}}
================================================ FILE: envs/webshop/src/webshop/web_agent_site/templates/features_page.html ================================================Thank you for shopping with us!
Your code:
{{ mturk_code }}(Paste it in your MTurk interface.)
Your score (min 0.0, max 1.0)
{{ reward }}Reward Details
{{ reward_info | pprint }}================================================ FILE: envs/webshop/src/webshop/web_agent_site/templates/item_page.html ================================================Instruction:
{{ instruction_text }}{% for bulletpoint in product_info.BulletPoints %}
- {% endfor %}
{{bulletpoint}}
================================================ FILE: envs/webshop/src/webshop/web_agent_site/templates/results_page.html ================================================Instruction:
{{ instruction_text }}{% for option_name, option_contents in product_info.options.items() %}
{% endfor %}{{ option_name }}
{{product_info.Title}}
Price: {{product_info.Price}}
Rating: {{product_info.Rating}}
{% if show_attrs %}{% endif %}================================================ FILE: envs/webshop/src/webshop/web_agent_site/templates/review_page.html ================================================Instruction:
{{ instruction_text }}{% if page > 1 %}Page {{page}} (Total results: {{total}})
{% else %}{% endif %}{% for item in products %}{% endfor %}![]()
{% set item_page_url = url_for('item_page', session_id=session_id, asin=item.asin, keywords=keywords, page=page, options=dict() ) %}{{item.asin}}
{{item.Title}}
{{item.Price}}
================================================ FILE: envs/webshop/src/webshop/web_agent_site/templates/search_page.html ================================================Instruction:
{{ instruction_text }}{% for review in product_info.Reviews %}{% endfor %}"{{review.title}}"
{{review.score}} {% for i in range(review.score | int) %} {% endfor %} {% for i in range(5 - review.score | int) %} {% endfor %}
{{review.body}}
# ================================================
# FILE: envs/webshop/src/webshop/web_agent_site/utils.py
# ================================================
import os
import bisect
import hashlib
import logging
import random

from os.path import dirname, abspath, join

BASE_DIR = join(dirname(abspath(__file__)), '../..')
DEBUG_PROD_SIZE = None  # set to `None` to disable

DEFAULT_ATTR_PATH = join(BASE_DIR, '../data/items_ins_v2.json')
DEFAULT_FILE_PATH = join(BASE_DIR, '../data/items_shuffle.json')
DEFAULT_REVIEW_PATH = join(BASE_DIR, '../data/reviews.json')
FEAT_CONV = join(BASE_DIR, '../data/feat_conv.pt')
FEAT_IDS = join(BASE_DIR, '../data/feat_ids.pt')
HUMAN_ATTR_PATH = join(BASE_DIR, '../data/items_human_ins.json')
# HUMAN_ATTR_PATH = join(BASE_DIR, '../data/items_human_ins.json')


def random_idx(cum_weights):
    """Generate random index by sampling uniformly from sum of all weights,
    then selecting the `min` between the position to keep the list sorted
    (via bisect) and the value of the second to last index

    NOTE(review): with `cum_weights = [0, c1, ..., cN]`, bisect always returns
    an index >= 1, so index 0 is only reachable via the `min` clamp when
    N == 1 — confirm this sampling is intended before relying on the weights.
    """
    pos = random.uniform(0, cum_weights[-1])
    idx = bisect.bisect(cum_weights, pos)
    idx = min(idx, len(cum_weights) - 2)
    return idx


def setup_logger(session_id, user_log_dir):
    """Creates a log file and logging object for the corresponding session ID.

    Fix: guard against handler duplication — previously every call for the
    same session attached a fresh FileHandler (and re-truncated the file via
    mode='w'), so repeated calls duplicated every subsequent record.
    """
    logger = logging.getLogger(session_id)
    logger.setLevel(logging.INFO)
    if not logger.handlers:  # only attach a handler on first setup
        formatter = logging.Formatter('%(message)s')
        file_handler = logging.FileHandler(
            user_log_dir / f'{session_id}.jsonl', mode='w'
        )
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger


def generate_mturk_code(session_id: str) -> str:
    """Generates a redeem code corresponding to the session ID for an MTurk
    worker once the session is completed
    """
    sha = hashlib.sha1(session_id.encode())
    return sha.hexdigest()[:10].upper()
# ================================================
# FILE: requirements.txt (extraction residue, preserved verbatim)
# openai==2.6.1
# rich==14.2.0
# torch==2.9.0
# ================================================
FILE: run.py
================================================
import argparse
import asyncio
import importlib
import time
import shutil
from pathlib import Path
from typing import Type, Dict, Any, List, Optional
import inspect
import logging
from datetime import datetime

# Suppress common warnings - must be at the very beginning
import warnings
import os
os.environ['PYTHONWARNINGS'] = 'ignore::DeprecationWarning'

# Suppress all categories of warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")

# Make sure these are applied globally
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Specific warning suppressions
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Module-specific suppressions
warnings.filterwarnings("ignore", module="gym")
warnings.filterwarnings("ignore", module="gym.*")
warnings.filterwarnings("ignore", module="faiss")
warnings.filterwarnings("ignore", module="faiss.*")
warnings.filterwarnings("ignore", module="setuptools")
warnings.filterwarnings("ignore", module="setuptools.*")
warnings.filterwarnings("ignore", module="typer")
warnings.filterwarnings("ignore", module="typer.*")
warnings.filterwarnings("ignore", module="spacy")
warnings.filterwarnings("ignore", module="spacy.*")
warnings.filterwarnings("ignore", module="click")
warnings.filterwarnings("ignore", module="click.*")

from tqdm import tqdm

from base.agent import Agent
from base.environment import Env
from utils.logger import SimpleLogger
from utils.errors import StepLimitError

# Allow short aliases like `-a human` and `-e alfworld`
AGENT_ALIASES = {
    "recode": "agents.recode.agent.ReCodeAgent",
}
ENV_ALIASES = {
    "alfworld": "envs.alfworld.env.AlfworldEnv",
    "webshop": "envs.webshop.env.WebShopEnv",
    "sciworld": "envs.sciworld.env.SciWorldEnv",
}


def resolve_class_identifier(identifier: str, aliases: Dict[str, str], kind: str) -> str:
    """Resolve a possibly-short alias (e.g., 'human') to a full dotted class path.

    If `identifier` already looks like a dotted path, return it unchanged.
    Otherwise, look up a lowercase alias in `aliases`.
    """
    if not identifier:
        raise ValueError(f"Empty {kind} identifier")
    if "." in identifier:
        return identifier
    key = identifier.strip().lower()
    if key in aliases:
        return aliases[key]
    available = ", ".join(sorted(aliases.keys()))
    raise ValueError(f"Unknown {kind} alias '{identifier}'. Available: {available}")


def _default_run_id(agent_path: str, env_path: str) -> str:
    """Generate default run_id = <timestamp>_<AgentClassName>_<EnvClassName>."""
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    agent_cls_name = agent_path.split(".")[-1]
    env_cls_name = env_path.split(".")[-1]
    return f"{ts}_{agent_cls_name}_{env_cls_name}"


def create_instance(cls: Type, running_config: Optional[Dict[str, Any]], logger: Optional[SimpleLogger]):
    """Instantiate a class, injecting logger and config-defined constructor kwargs."""
    sig = inspect.signature(cls)
    kwargs: Dict[str, Any] = {}
    if logger is not None and "logger" in sig.parameters:
        kwargs["logger"] = logger
    for k in running_config or {}:
        if k in sig.parameters and k not in kwargs:
            kwargs[k] = running_config[k]
    # Special-case: map the config's `task_types` list/str onto a `task_type`
    # constructor parameter (first entry wins, upper-cased)
    if "task_type" in sig.parameters and running_config and "task_types" in running_config:
        task_types = running_config.get("task_types", [])
        if isinstance(task_types, list) and task_types:
            kwargs["task_type"] = task_types[0].upper()
        elif isinstance(task_types, str):
            kwargs["task_type"] = task_types.upper()
    try:
        return cls(**kwargs)  # type: ignore[arg-type]
    except TypeError:
        # Constructor rejected our kwargs — fall back to a bare instantiation
        return cls()


def load_class(path: str) -> Type:
    """Import a class given a dotted path "package.module.Class" only."""
    try:
        module_path, class_name = path.rsplit(".", 1)
    except ValueError:
        raise ValueError(f"Invalid class path '{path}'. Expected format: package.module.Class")
    module = importlib.import_module(module_path)
    cls = getattr(module, class_name, None)
    if not isinstance(cls, type):
        raise AttributeError(f"'{path}' does not resolve to a class")
    return cls


def _safe_report(obj: Any) -> Dict[str, Any]:
    """Call obj.report() if available and return a dict; otherwise return {}."""
    try:
        if hasattr(obj, "report") and callable(getattr(obj, "report")):
            data = getattr(obj, "report")() or {}
            return data if isinstance(data, dict) else {}
    except Exception:
        return {}
    return {}


def _assemble_result(
    agent: Agent,
    env: Env,
    instance_id: Optional[int],
    duration: float,
    error: Optional[str] = None,
) -> Dict[str, Any]:
    """Assemble the unified result dict from agent/env reports plus local info."""
    agent_report = _safe_report(agent)
    # print(f"[HERE] {agent_report}")
    env_report = _safe_report(env)
    # Ensure task_type present if env exposes it
    if hasattr(env, "task_type") and "task_type" not in env_report:
        try:
            env_report["task_type"] = getattr(env, "task_type")
        except Exception:
            pass
    local_info: Dict[str, Any] = {
        "instance_id": instance_id,
        "time": duration,
    }
    if error is not None:
        local_info["error"] = error
    # Later dicts win on key collisions: local info overrides env, env overrides agent
    return {**agent_report, **env_report, **local_info}


async def run_single_instance(
    agent: Agent,
    env: Env,
    config: Dict[str, Any],
    logger: SimpleLogger,
    instance_id: Optional[int] = None,
) -> Dict[str, Any]:
    """Run one episode and collect result dict (async)."""
    # Determine per-instance time limit (seconds). Default to 900s if unspecified.
    try:
        max_duration_cfg = config.get("max_duration", 900)
        time_limit_secs = float(max_duration_cfg if max_duration_cfg is not None else 900)
        if time_limit_secs <= 0:
            time_limit_secs = 900.0
    except Exception:
        time_limit_secs = 900.0

    init_info = env.reset(config, str(instance_id) if instance_id is not None else None)
    observations = init_info["observations"]
    agent.reset(config, init_info)
    logger.info(f"[Instance {instance_id}] Environment reset. Starting episode.")
    start_time = time.time()

    async def episode_runner() -> Dict[str, Any]:
        # Core act/observe loop; all terminal paths return an assembled result
        nonlocal observations
        try:
            while not env.is_done():
                actions = await agent.act(observations)
                observations = await env.run(actions)
            success = env.is_success()
            duration_local = time.time() - start_time
            final_steps_local = env._step_count
            logger.info(
                f"{env.id}-Finished: {'SUCCESS' if success else 'FAILURE'} "
                f"({final_steps_local} steps, {duration_local:.4f}s)"
            )
            return _assemble_result(agent, env, instance_id, duration_local)
        except StepLimitError as e:
            duration_local = time.time() - start_time
            final_steps_local = env._step_count
            logger.warning(f"[Instance {instance_id}] {e} ({final_steps_local} steps, {duration_local:.4f}s)")
            return _assemble_result(agent, env, instance_id, duration_local, error=str(e))
        except Exception as e:
            duration_local = time.time() - start_time
            try:
                final_steps_local = env.get_step_count()
            except Exception:
                final_steps_local = getattr(env, "_step_count", 0)
            logger.error(f"{env.id}-ERROR: {e} ({final_steps_local} steps, {duration_local:.4f}s)")
            return _assemble_result(agent, env, instance_id, duration_local, error=str(e))

    try:
        result = await asyncio.wait_for(episode_runner(), timeout=time_limit_secs)
        return result
    except asyncio.TimeoutError:
        duration = time.time() - start_time
        try:
            final_steps = env.get_step_count()
        except Exception:
            final_steps = getattr(env, "_step_count", 0)
        logger.warning(
            f"[Instance {instance_id}] TIMEOUT after {int(time_limit_secs)}s "
            f"({final_steps} steps, {duration:.4f}s)"
        )
        res = _assemble_result(
            agent, env, instance_id, duration,
            error=f"Timeout after {int(time_limit_secs)}s"
        )
        # Explicitly mark as failure to ensure correct final statistics
        res["success"] = False
        return res


async def run_concurrent_instances(
    agent_cls: Type[Agent],
    env_cls: Type[Env],
    num_instances: int,
    max_concurrent: int = 10,
    config: Optional[Dict[str, Any]] = None,
    logger: Optional[SimpleLogger] = None,
) -> List[Dict[str, Any]]:
    """Run many
environment instances concurrently with a live progress UI. If `rich` is available, use a richer UI with per-instance spinners; otherwise fallback to tqdm-based overall bar plus lightweight per-instance lines. """ config = config or {} # Determine base start id from running_config try: base_start_id = int(config.get("start_id", 0) or 0) except (TypeError, ValueError): base_start_id = 0 sem = asyncio.Semaphore(max_concurrent) # Decide whether to use any progress UI. Allow config to forcibly disable it # (e.g., for HumanAgent which reads from stdin and conflicts with live updating UIs). disable_rich_ui = False try: # Accept multiple possible keys to disable rich UI for key in ("disable_rich_ui", "no_rich", "disable_rich"): v = config.get(key) if isinstance(v, str): v_norm = v.strip().lower() if v_norm in ("1", "true", "yes", "y", "on"): # treat truthy strings as True disable_rich_ui = True break elif v: disable_rich_ui = True break except Exception: disable_rich_ui = False # Also disable UI if HumanAgent is used to avoid interfering with stdin try: if getattr(agent_cls, "__name__", "") == "HumanAgent": disable_rich_ui = True except Exception: pass use_rich = False if not disable_rich_ui: try: from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn, BarColumn, TaskProgressColumn from rich.console import Group from rich.live import Live from rich.text import Text use_rich = True except Exception: use_rich = False # Common runner utilities ------------------------------------------------- def make_instance_logger(effective_id: int): instance_logger = None if logger is not None: import logging instance_logger_name = f"instance_{effective_id}_{logger.run_id}" instance_logger_obj = logging.getLogger(instance_logger_name) instance_logger_obj.setLevel(logging.INFO) instance_logger_obj.handlers.clear() instance_log_file = Path(logger.get_log_dir()) / f"instance_{effective_id}.log" file_handler = logging.FileHandler(instance_log_file, mode="w", 
encoding="utf-8") from utils.logger import MultiLineFormatter file_handler.setFormatter(MultiLineFormatter('%(asctime)s - %(levelname)s - %(message)s')) instance_logger_obj.addHandler(file_handler) class InstanceLogger: def __init__(self, logger_obj, main_logger): self.logger = logger_obj self.main_logger = main_logger self.run_id = main_logger.run_id def info(self, message): self.logger.info(message) def warning(self, message): self.logger.warning(message) def error(self, message): self.logger.error(message) def get_log_dir(self): return self.main_logger.get_log_dir() def get_base_dir(self): return self.main_logger.get_base_dir() instance_logger = InstanceLogger(instance_logger_obj, logger) return instance_logger or logger # No-UI branch (for HumanAgent or when explicitly disabled) -------------- if disable_rich_ui: results: List[Dict[str, Any]] = [] for instance_id in range(num_instances): effective_id = base_start_id + instance_id plogger = make_instance_logger(effective_id) agent = create_instance(agent_cls, config, plogger) env = create_instance(env_cls, config, plogger) res = await run_single_instance(agent, env, config, plogger, effective_id) results.append(res) return results # Rich UI branch ---------------------------------------------------------- if use_rich: results: List[Dict[str, Any]] = [] overall_progress = Progress( TextColumn("[bold]Overall[/bold]"), BarColumn(bar_width=None), TaskProgressColumn(), TimeElapsedColumn(), refresh_per_second=8, ) instances_progress = Progress( SpinnerColumn(style="cyan"), TextColumn("[bold]{task.description}[/bold]"), TextColumn("{task.fields[status]}", style="dim"), refresh_per_second=8, ) instance_tasks: Dict[int, int] = {} finished_names: List[str] = [] async def runner(instance_id: int): async with sem: effective_id = base_start_id + instance_id plogger = make_instance_logger(effective_id) agent = create_instance(agent_cls, config, plogger) env = create_instance(env_cls, config, plogger) # Add a per-instance 
spinner task task_id = instances_progress.add_task(f"instance {effective_id}", status="running") instance_tasks[effective_id] = task_id try: res = await run_single_instance(agent, env, config, plogger, effective_id) overall_progress.update(overall_task, advance=1) # Remove finished task from running list and update Done line try: # Prefer hiding the task to avoid accumulating many visible lines instances_progress.update(task_id, status="done", visible=True) # Immediately hide the finished task for a clean UI instances_progress.update(task_id, visible=False) except Exception: pass try: # Best-effort removal (not strictly required if hidden) instances_progress.remove_task(task_id) except Exception: pass finished_names.append(f"instance {effective_id}") try: done_renderable = Text("✔ Done: ", style="green") if finished_names: done_renderable.append(", ".join(finished_names)) live.update(Group(overall_progress, instances_progress, done_renderable)) except Exception: pass return res finally: # Keep finished tasks displayed; just cleanup mapping instance_tasks.pop(effective_id, None) # Ensure any lingering task is hidden in case of earlier failure try: tid = instance_tasks.get(effective_id) if tid is not None: instances_progress.update(tid, visible=False) except Exception: pass with Live(Group(overall_progress, instances_progress, Text("✔ Done: ", style="green")), refresh_per_second=8, transient=False) as live: overall_task = overall_progress.add_task("Instances", total=num_instances) tasks = [asyncio.create_task(runner(i)) for i in range(num_instances)] raw_results = await asyncio.gather(*tasks, return_exceptions=True) for idx, res in enumerate(raw_results): if isinstance(res, Exception): if logger: logger.error(f"Instance {idx} raised exception: {res}") else: print(f"Instance {idx} raised exception: {res}") results.append({"instance_id": idx, "success": False, "error": str(res)}) else: results.append(res) # type: ignore[arg-type] return results # Fallback tqdm branch 
---------------------------------------------------- # Main overall progress bar (position 0) progress_bar = tqdm(total=num_instances, desc="Instances", leave=True) # Allocate fixed display slots for per-instance lightweight spinners slot_queue: asyncio.Queue[int] = asyncio.Queue() for i in range(max_concurrent): slot_queue.put_nowait(i) slot_bars = [ tqdm( total=1, position=1 + i, leave=True, bar_format="{desc} {postfix}", dynamic_ncols=True, ) for i in range(max_concurrent) ] for i, bar in enumerate(slot_bars): bar.set_description_str("[instance -]") bar.set_postfix_str("") active_slots: Dict[int, Dict[str, Any]] = {} stop_spinners = asyncio.Event() async def spinner_updater(): spinner_chars = ["|", "/", "-", "\\"] idx = 0 try: while not stop_spinners.is_set(): for slot, meta in list(active_slots.items()): bar = slot_bars[slot] inst_id = meta.get("id") bar.set_description_str(f"[instance {inst_id}]") bar.set_postfix_str(f"running {spinner_chars[idx % len(spinner_chars)]}") bar.refresh() idx += 1 await asyncio.sleep(0.1) finally: for slot, meta in list(active_slots.items()): bar = slot_bars[slot] inst_id = meta.get("id") bar.set_description_str(f"[instance {inst_id}]") bar.set_postfix_str("done") bar.refresh() async def runner(instance_id: int): async with sem: effective_id = base_start_id + instance_id slot = await slot_queue.get() active_slots[slot] = {"id": effective_id} plogger = make_instance_logger(effective_id) agent = create_instance(agent_cls, config, plogger) env = create_instance(env_cls, config, plogger) try: result = await run_single_instance(agent, env, config, plogger, effective_id) progress_bar.update(1) return result finally: try: bar = slot_bars[slot] bar.set_description_str(f"[instance {effective_id}]") bar.set_postfix_str("done") bar.refresh() except Exception: pass active_slots.pop(slot, None) slot_queue.put_nowait(slot) spinner_task = asyncio.create_task(spinner_updater()) tasks = [asyncio.create_task(runner(i)) for i in range(num_instances)] 
    # Gather all instance tasks; return_exceptions=True so one failing
    # instance does not cancel the others.
    raw_results = await asyncio.gather(*tasks, return_exceptions=True)
    stop_spinners.set()
    try:
        await spinner_task
    except Exception:
        pass
    progress_bar.close()
    for bar in slot_bars:
        try:
            bar.close()
        except Exception:
            pass
    # Normalize results: exceptions become failure records so the summary
    # always has one entry per instance.
    results: List[Dict[str, Any]] = []
    for idx, res in enumerate(raw_results):
        if isinstance(res, Exception):
            if logger:
                logger.error(f"Instance {idx} raised exception: {res}")
            else:
                print(f"Instance {idx} raised exception: {res}")
            results.append({"instance_id": idx, "success": False, "error": str(res)})
        else:
            results.append(res)  # type: ignore[arg-type]
    return results


def write_summary(results: List[Dict[str, Any]], output_file: Path):
    """Write results summary to `output_file`, creating parent dirs."""
    total = len(results)
    successes = sum(1 for r in results if r.get("success"))
    # Per-task-type aggregation (only if task_type present)
    by_task: Dict[str, Dict[str, Any]] = {}
    for r in results:
        if "task_type" not in r or r.get("task_type") is None:
            continue
        task_type = str(r.get("task_type"))
        bucket = by_task.setdefault(task_type, {
            "total_instances": 0,
            "successful_instances": 0,
            "total_time": 0.0,
            "total_steps": 0,
            "total_cost": 0.0,
            "total_reward": 0.0,
        })
        bucket["total_instances"] += 1
        if r.get("success"):
            bucket["successful_instances"] += 1
        bucket["total_time"] += float(r.get("time", 0.0))
        # `or 0` guards against steps being None in a result dict
        bucket["total_steps"] += int(r.get("steps", 0) or 0)
        bucket["total_cost"] += float(r.get("cost", 0.0))
        bucket["total_reward"] += float(r.get("reward", 0.0))
    # Compute averages per task
    for t, b in by_task.items():
        ti = b["total_instances"] or 1  # avoid division by zero
        b["success_rate"] = b["successful_instances"] / ti
        b["avg_time_per_instance"] = b["total_time"] / ti
        b["avg_steps_per_instance"] = b["total_steps"] / ti
        b["avg_cost_per_instance"] = b["total_cost"] / ti
    # Dynamically aggregate all numeric-like metrics (totals and averages)
    # Exclude only 'success' to avoid double counting in metrics,
    # and exclude non-meaningful fields like instance_id
    numeric_keys = set()
    for r in results:
        for k, v in r.items():
            if k in ("instance_id", "success"):
                continue
            if isinstance(v, (int, float, bool)):
                numeric_keys.add(k)
    metrics_total: Dict[str, float] = {}
    metrics_avg: Dict[str, float] = {}
    for k in sorted(numeric_keys):
        s = 0.0
        for r in results:
            try:
                val = r.get(k, 0)
                s += float(val or 0)
            except Exception:
                # non-numeric value for this key in some result: skip it
                continue
        metrics_total[k] = s
        metrics_avg[k] = (s / total) if total > 0 else 0.0
    # Per-task dynamic metrics
    by_task_metrics: Dict[str, Dict[str, Dict[str, float]]] = {}
    if by_task:
        for task_type in by_task.keys():
            totals: Dict[str, float] = {}
            avgs: Dict[str, float] = {}
            bucket_results = [r for r in results if str(r.get("task_type")) == task_type]
            bucket_n = len(bucket_results) or 1
            for k in sorted(numeric_keys):
                s = 0.0
                for r in bucket_results:
                    try:
                        val = r.get(k, 0)
                        s += float(val or 0)
                    except Exception:
                        continue
                totals[k] = s
                avgs[k] = s / bucket_n
            by_task_metrics[task_type] = {"metrics_total": totals, "metrics_avg": avgs}
    summary = {
        "summary": {
            "total_instances": total,
            "successful_instances": successes,
            "success_rate": successes / total if total > 0 else 0,
            "metrics_total": metrics_total,
            "metrics_avg": metrics_avg,
        },
        "instances": results,
    }
    if by_task:
        # merge base by_task stats with dynamic metrics
        merged_by_task: Dict[str, Any] = {}
        for t, base_stats in by_task.items():
            merged = dict(base_stats)
            if t in by_task_metrics:
                merged.update(by_task_metrics[t])
            merged_by_task[t] = merged
        summary["by_task_type"] = merged_by_task
    output_file.parent.mkdir(parents=True, exist_ok=True)
    import json
    output_file.write_text(json.dumps(summary, indent=2))
    print("\n📊 Summary:")
    rate_pct = (successes / total * 100.0) if total > 0 else 0.0
    print(f" Success: {successes}/{total} ({rate_pct:.4f}%)")
    print(f" Results saved to: {output_file}")
    # Print per-task breakdown if any
    if by_task:
        print(" By task_type:")
        for t, b in by_task.items():
            rpct = b["success_rate"] * 100.0
            print(f" - {t}: {b['successful_instances']}/{b['total_instances']} ({rpct:.4f}%)")
    # Print standard metrics if present
    standard_keys_order = ["time", "steps", "cost", "reward"]
    std_present = [k for k in standard_keys_order if k in metrics_total]
    if std_present:
        print(" Metrics (totals/avg):")
        for k in std_present:
            total_v = metrics_total[k]
            avg_v = metrics_avg[k]
            try:
                print(f" - {k}: total={total_v:.4f}, avg={avg_v:.4f}")
            except Exception:
                # value not formattable as float (e.g. bool-like): print raw
                print(f" - {k}: total={total_v}, avg={avg_v}")
    # Print any additional numeric metrics not already shown (excluding 'success')
    excluded_keys = set(std_present)
    extra_keys = [k for k in metrics_total.keys() if k not in excluded_keys]
    if extra_keys:
        print(" Extra metrics (totals/avg):")
        for k in extra_keys:
            total_v = metrics_total[k]
            avg_v = metrics_avg[k]
            try:
                print(f" - {k}: total={total_v:.4f}, avg={avg_v:.4f}")
            except Exception:
                print(f" - {k}: total={total_v}, avg={avg_v}")


def main():
    """CLI entry point: parse args, optionally merge a YAML config, run instances."""
    parser = argparse.ArgumentParser(
        description="Run an agent in an environment",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-a", "--agent", type=str, default="agents.recode.agent.ReCodeAgent",
        help="Agent class path or alias. Examples: agents.recode.agent.ReCodeAgent | aliases: human, recode, react, codeact, adaplanner",
    )
    parser.add_argument(
        "-e", "--env", type=str, default="envs.alfworld.env.AlfworldEnv",
        help="Environment class path or alias. Examples: envs.alfworld.env.AlfworldEnv | aliases: alfworld, webshop, sciworld, travelplanner",
    )
    parser.add_argument(
        "-n", "--instances", type=int, default=1,
        help="Number of instances to run",
    )
    parser.add_argument(
        "-c", "--concurrent", type=int, default=1,
        help="Maximum concurrent instances",
    )
    parser.add_argument(
        "-o", "--output", type=str, default="results.json",
        help="Results JSON filename (will be saved in logs/ /)",
    )
    parser.add_argument(
        "-C", "--config", type=str, default=None,
        help="YAML config file path.
Values here override CLI flags.",
    )
    parser.add_argument(
        "--split", type=str, default="test",
        help="Dataset split to use (e.g., train/valid/test)",
    )
    parser.add_argument(
        "--seed", type=int, default=42,
        help="Random seed forwarded to environments",
    )
    parser.add_argument(
        "-p", "--profile", type=str, default=None,
        help="LLM profile name forwarded to the agent",
    )
    parser.add_argument(
        "-l", "--log-dir", type=str, default=None,
        help="Custom log directory name (otherwise autogenerated)",
    )
    parser.add_argument(
        "--max-depth", type=int, default=None,
        help="Maximum depth for agent execution",
    )
    args = parser.parse_args()
    try:
        # Load YAML config (overrides CLI)
        import yaml
        yaml_cfg = {}
        if args.config:
            try:
                with open(args.config) as f:
                    yaml_cfg = yaml.safe_load(f) or {}
            except FileNotFoundError:
                print(f"⚠️ Config file not found: {args.config}. Using CLI values only.")
                yaml_cfg = {}
        # Compose final config: CLI base, YAML overrides
        cli_cfg: Dict[str, Any] = {
            "agent": args.agent,
            "env": args.env,
            "instances": args.instances,
            "concurrent": args.concurrent,
            "output": args.output,
            "log_dir": args.log_dir,
            "split": args.split,
            "seed": args.seed,
            "profile": args.profile,
            "max_depth": args.max_depth,
        }
        config: Dict[str, Any] = {**cli_cfg, **yaml_cfg}
        agent_path: str = config.get("agent", args.agent)
        env_path: str = config.get("env", args.env)
        # `or 1` protects against explicit null/0 in the YAML config
        instances: int = int(config.get("instances", args.instances) or 1)
        concurrent: int = int(config.get("concurrent", args.concurrent) or 1)
        output_name: str = str(config.get("output", args.output))
        # Resolve short aliases if provided
        agent_path = resolve_class_identifier(agent_path, AGENT_ALIASES, "agent")
        env_path = resolve_class_identifier(env_path, ENV_ALIASES, "env")
        agent_cls = load_class(agent_path)
        env_cls = load_class(env_path)
        # Use class names for default run_id for readability
        run_id = config.get("log_dir") or _default_run_id(agent_cls.__name__, env_cls.__name__)
        # Clear existing log directory if present
        existing_base_dir = Path("logs") / run_id
        if existing_base_dir.exists():
            try:
                shutil.rmtree(existing_base_dir)
            except Exception as e:
                print(f"⚠️ Failed to clear existing log directory: {existing_base_dir} ({e})")
        logger = SimpleLogger(run_id=run_id)
        # Special handling for HumanAgent: disable Rich UI and force concurrency to 1
        is_human_agent = (getattr(agent_cls, "__name__", "") == "HumanAgent") or agent_path.endswith(".HumanAgent")
        if is_human_agent:
            if concurrent != 1:
                logger.info(f"Human agent detected. Forcing max concurrent to 1 (was {concurrent}).")
                concurrent = 1
                config["concurrent"] = 1
            # NOTE(review): nesting reconstructed from whitespace-collapsed source;
            # presumably disable_rich_ui applies whenever the agent is human — confirm.
            config["disable_rich_ui"] = True
        logger.info(f"🤖 Agent: {agent_path}")
        logger.info(f"🌍 Environment: {env_path}")
        logger.info(f"📊 Instances: {instances} (max {concurrent} concurrent)")
        logger.info("-" * 50)
        results = asyncio.run(
            run_concurrent_instances(agent_cls, env_cls, instances, concurrent, config, logger)
        )
        output_file = logger.get_base_dir() / output_name
        write_summary(results, output_file)
    except KeyboardInterrupt:
        print("\n⏹️ Interrupted by user")
    except Exception as e:
        # import traceback
        # traceback.print_exc()
        print(f"\n❌ Error: {e}")
        return 1
    return 0


if __name__ == "__main__":
    exit(main())


================================================ FILE: utils/common.py ================================================
import json
import yaml
from pathlib import Path
from typing import Any, List, Dict, Optional
from pydantic_core import to_jsonable_python
import re


def read_json_file(json_file: str, encoding="utf-8") -> List[Any]:
    """Load and return the JSON payload of `json_file`.

    Raises FileNotFoundError when the path does not exist and ValueError
    when the file cannot be parsed as JSON.
    """
    if not Path(json_file).exists():
        raise FileNotFoundError(f"json_file: {json_file} not exist, return []")
    with open(json_file, "r", encoding=encoding) as fin:
        try:
            data = json.load(fin)
        except Exception:
            raise ValueError(f"read json file: {json_file} failed")
    return data


def write_json_file(json_file: str, data: list, encoding: str = None, indent: int = 4):
    """Serialize `data` to `json_file`, creating parent directories as needed."""
    folder_path = Path(json_file).parent
    if not folder_path.exists():
        folder_path.mkdir(parents=True, exist_ok=True)
with open(json_file, "w", encoding=encoding) as fout: json.dump(data, fout, ensure_ascii=False, indent=indent, default=to_jsonable_python) def read_yaml_file(yaml_file: str, encoding='utf-8') -> Dict[str, Any]: if not Path(yaml_file).exists(): raise FileNotFoundError(f"yaml_file: {yaml_file} not exist, return empty dict") with open(yaml_file, "r", encoding=encoding) as f: try: data = yaml.safe_load(f) except Exception: raise ValueError(f"read yaml file: {yaml_file} failed") return data def parse_code_block(text: str, lang: str = "python") -> Optional[str]: """Extracts the first code block of a given language from a markdown-formatted text.""" pattern = rf"```{lang}\s*\n(.*?)\n```" match = re.search(pattern, text, re.DOTALL) if match: return match.group(1).strip() return None def parse_xml_tag(response: str, xml_tag: str) -> str: pattern = rf"<{xml_tag}>(.*?){xml_tag}>" match = re.search(pattern, response, re.DOTALL) return match.group(1).strip() if match else "" ================================================ FILE: utils/errors.py ================================================ class StepLimitError(Exception): """Raised when the environment exceeds the maximum allowed step count.""" pass ================================================ FILE: utils/executor.py ================================================ from typing import List, Dict, Any, Callable import io, sys import functools import asyncio import types import re import threading import time from utils.llm import AsyncLLM from base.environment import Env def print_output(func): @functools.wraps(func) def wrapper(*args, **kwargs): result = func(*args, **kwargs) if result is not None: print(result, file=sys.stdout, flush=True) return result return wrapper class Executor: def __init__(self, env: Env = None, if_run_print: bool = False) -> None: self.env = env self.actions: List[str] = [] self._variables: Dict[str, Any] = {} self.if_run_print = if_run_print if self.if_run_print: self.run = 
print_output(self.run)
        # Globals injected into every exec()'d block
        self._base_globals = {
            "run": self.run,
            "re": re,
        }
        # Dedicated event loop running in a background thread; lets the
        # synchronous exec()'d code await env/LLM coroutines via _submit_coro.
        self._loop = None
        self._loop_thread = None
        self._start_loop_thread()

    def register_function(self, name: str, func: Callable):
        """Expose `func` under `name` inside executed code blocks."""
        self._base_globals[name] = func

    def register_action_function(self, name: str, func: Callable):
        """Expose `func` so its return value is forwarded to env via `run`."""
        func_with_run = lambda *args, **kwargs: self.run(func(*args, **kwargs))
        self.register_function(name, func_with_run)

    def register_ask_llm(self, llm: AsyncLLM):
        """Expose a synchronous `ask_llm(query)` backed by the async LLM."""
        def _ask_llm_sync(query: str) -> str:
            async def _ask_llm(query: str) -> str:
                response, _cost = await llm(
                    prompt=query,
                )
                return response
            # Bridge: run the coroutine on the background loop and block.
            return self._submit_coro(_ask_llm(query))
        self.register_function("ask_llm", _ask_llm_sync)

    def skip(self, reason: str):
        # No-op placeholder action.
        return None

    def set_var(self, key: str, value: Any):
        self._variables[key] = value

    def get_var(self, key: str) -> Any:
        if key not in self._variables:
            return None
        return self._variables.get(key)

    def set_env(self, env: Env):
        self.env = env

    def _is_preserved_variable(self, key: str, value: Any) -> bool:
        """Return True when a name from exec() globals should be persisted.

        Skips private names, injected globals, and modules/functions/classes.
        """
        if key.startswith('_') or key in self._base_globals:
            return False
        return not isinstance(value, (types.ModuleType, types.FunctionType, types.BuiltinFunctionType, types.MethodType, type))

    def _infer_type_string(self, value: Any, depth: int = 0, max_depth: int = 2) -> str:
        """Best-effort human-readable type string (e.g. 'list[str]').

        Recurses up to `max_depth`, sampling at most 5 elements per container.
        """
        if value is None:
            return "NoneType"
        if depth > max_depth:
            return type(value).__name__
        try:
            if isinstance(value, (bool, int, float, str)):
                return type(value).__name__
            if isinstance(value, list):
                if not value:
                    return "list"
                elem_types = {self._infer_type_string(v, depth + 1, max_depth) for v in value[:5]}
                if len(elem_types) == 1:
                    return f"list[{next(iter(elem_types))}]"
                return "list"
            if isinstance(value, tuple):
                if not value:
                    return "tuple"
                elem_types = [self._infer_type_string(v, depth + 1, max_depth) for v in value[:5]]
                if all(t == elem_types[0] for t in elem_types):
                    return f"tuple[{elem_types[0]}]"
                return f"tuple[{', '.join(elem_types)}]"
            if isinstance(value, set):
                if not value:
                    return "set"
                sample = list(value)[:5]
                elem_types = {self._infer_type_string(v, depth + 1, max_depth) for v in sample}
                if len(elem_types) == 1:
                    return f"set[{next(iter(elem_types))}]"
                return "set"
            if isinstance(value, dict):
                if not value:
                    return "dict"
                items = list(value.items())[:5]
                key_types = {self._infer_type_string(k, depth + 1, max_depth) for k, _ in items}
                val_types = {self._infer_type_string(v, depth + 1, max_depth) for _, v in items}
                if len(key_types) == 1 and len(val_types) == 1:
                    return f"dict[{next(iter(key_types))}, {next(iter(val_types))}]"
                return "dict"
            return type(value).__name__
        except Exception:
            return type(value).__name__

    def run(self, action: str) -> str:
        """Execute `action` in the environment and record it; returns the observation."""
        if self.env is None:
            raise RuntimeError("Environment not set. Call set_env() first.")
        # env.run is async; block on the background loop.
        result = self._submit_coro(self.env.run(action))
        self.actions.append(action)
        if isinstance(result, list):
            result = "\n".join(result)
        return result

    def get_actions(self) -> List[str]:
        """Return and clear the actions recorded since the last call."""
        actions = self.actions.copy()
        self.actions.clear()
        return actions

    def get_variables(self) -> str:
        """Render preserved variables as '- name (type): value' lines."""
        return "\n".join([f"- {key} ({self._infer_type_string(value)}): {value}" for key, value in self._variables.items()])

    def reset(self):
        self.actions.clear()
        self._variables.clear()

    def _start_loop_thread(self):
        """Start the background event-loop thread if it is not already running."""
        if self._loop and self._loop.is_running():
            return
        def _loop_runner():
            loop = asyncio.new_event_loop()
            self._loop = loop
            asyncio.set_event_loop(loop)
            loop.run_forever()
        t = threading.Thread(target=_loop_runner, daemon=True)
        t.start()
        # Busy-wait until the loop is actually running before accepting work.
        while self._loop is None or not self._loop.is_running():
            time.sleep(0.01)
        self._loop_thread = t

    def _submit_coro(self, coro):
        """Run `coro` on the background loop and block until it finishes."""
        self._start_loop_thread()
        future = asyncio.run_coroutine_threadsafe(coro, self._loop)
        return future.result()

    def close(self):
        """Stop the background loop and join its thread (best effort)."""
        if self._loop and self._loop.is_running():
            try:
                self._loop.call_soon_threadsafe(self._loop.stop)
            except Exception:
                pass
        if self._loop_thread:
            self._loop_thread.join(timeout=1)
        self._loop = None
        self._loop_thread = None

    def execute(self, code: str) -> Dict[str, Any]:
        """Execute a code block; returns code, captured stdout, error, success."""
        success,
stdout_lines, error_msg = self._run_block(code)
        return {"code": code, "stdout": stdout_lines, "error": error_msg, "success": success}

    def _run_block(self, block: str) -> tuple[bool, List[str], str]:
        """exec() `block` with preserved variables; capture stdout line by line.

        Returns (success, stdout_lines, error_message). A NameError on a name
        that is *called* in the block is reported as 'NeedExpansion', signalling
        the agent to expand that function instead of treating it as a failure.
        """
        output = []  # NOTE(review): unused; stdout is collected via OutputCapture below
        class OutputCapture:
            def __init__(self):
                self.lines = []
            def write(self, text):
                # Drop bare newlines and blank lines; keep non-empty content.
                if text and text != '\n':
                    self.lines.extend(line for line in text.splitlines() if line.strip())
            def flush(self):
                pass
        capture = OutputCapture()
        old_stdout = sys.stdout
        sys.stdout = capture
        # Execution namespace: injected helpers plus previously preserved variables.
        exec_globals = {**self._base_globals, **self._variables}
        try:
            exec(block, exec_globals)
            # Persist plain data variables for subsequent blocks.
            for key, value in exec_globals.items():
                if self._is_preserved_variable(key, value):
                    self._variables[key] = value
            return True, capture.lines, ""
        except NameError as e:
            match = re.search(r"name '(.+?)' is not defined", str(e))
            if match and f"{match.group(1)}(" in block:
                return False, capture.lines, f"NeedExpansion: `{match.group(1)}` needs to be expanded."
            return False, capture.lines, f"NameError: {e}"
        except Exception as e:
            return False, capture.lines, f"{e.__class__.__name__}: {e}"
        finally:
            # Always restore the real stdout, even on error.
            sys.stdout = old_stdout


================================================ FILE: utils/llm.py ================================================
import os
import asyncio
import yaml
import random
from pathlib import Path
from typing import Optional, Dict, Any, Tuple, Union, List
from openai import AsyncOpenAI, APIError, APIConnectionError, APITimeoutError, RateLimitError
from pydantic import BaseModel, Field, model_validator, ConfigDict
from utils.common import read_json_file

DEFAULT_LLM_PROFILE_PATH = Path("configs/profiles.yaml")
DEFAULT_PRICE_PATH = Path("configs/prices.json")


class LLMConfig(BaseModel):
    """Validated configuration for an OpenAI-compatible chat endpoint."""
    model_config = ConfigDict(extra="forbid", validate_default=True)
    api_key: Optional[str] = Field(
        default=None,
        description="OpenAI API key (defaults to OPENAI_API_KEY environment variable)"
    )
    base_url: Optional[str] = Field(
        default=None,
        description="Custom API base URL for OpenAI-compatible endpoints"
    )
    model: str =
Field(default="gpt-4o-mini", description="Model name to use")
    temperature: Optional[float] = Field(
        default=None, ge=0.0, le=2.0,
        description="Sampling temperature (omit by default; excluded for o-series)"
    )
    max_tokens: Optional[int] = Field(
        default=None, gt=0,
        description="Maximum number of tokens to generate (omit by default)"
    )
    timeout: int = Field(default=60, gt=0, description="API request timeout in seconds")
    max_retries: int = Field(default=3, ge=0, description="Maximum retry attempts")
    retry_base_delay: float = Field(default=1.0, description="Base retry delay in seconds")
    retry_jitter: float = Field(default=0.1, description="Retry delay jitter factor")
    track_costs: bool = Field(default=True, description="Enable cost tracking")

    @classmethod
    def from_profile(cls, profile: str = "default", config_path: Path = DEFAULT_LLM_PROFILE_PATH) -> "LLMConfig":
        """Build a config from a named profile in the YAML file; defaults on any error."""
        try:
            with config_path.open("r", encoding="utf-8") as f:
                config = yaml.safe_load(f)
            profile_config = config.get("models", {}).get(profile, {})
            # Keep only known, non-None fields so pydantic defaults apply otherwise.
            return cls(**{
                k: v for k, v in profile_config.items()
                if v is not None and k in cls.model_fields
            })
        except Exception as e:
            return cls()

    @model_validator(mode="after")
    def resolve_api_key(self) -> "LLMConfig":
        # Fall back to the environment variable when no key is configured.
        if not self.api_key:
            self.api_key = os.environ.get("OPENAI_API_KEY")
        return self


class CostCalculator(BaseModel):
    """Computes request cost from per-million-token prices loaded from disk."""
    pricing: Dict[str, Dict[str, float]] = Field(
        default_factory=lambda: read_json_file(DEFAULT_PRICE_PATH),
        description="Pricing data in USD per million tokens"
    )

    def compute_cost(
        self, model: str, prompt_tokens: int, completion_tokens: int
    ) -> Tuple[float, Dict[str, Any]]:
        """Return (total_cost_usd, breakdown) for one request.

        Unknown models fall back to the 'default' pricing entry.
        """
        rates = self.pricing.get(model, self.pricing["default"])
        input_cost = (prompt_tokens / 1e6) * rates["input"]
        output_cost = (completion_tokens / 1e6) * rates["output"]
        total_cost = input_cost + output_cost
        cost_breakdown = {
            "model": model,
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
            "input_cost": input_cost,
            "output_cost": output_cost,
            "total_cost": total_cost,
            "currency": "USD"
        }
        return total_cost, cost_breakdown


class AsyncLLM(BaseModel):
    """Async chat-completion client with retries and cumulative cost tracking."""
    config: LLMConfig = Field(default_factory=LLMConfig)
    cost_calculator: CostCalculator = Field(default_factory=CostCalculator)
    client: Optional[AsyncOpenAI] = Field(default=None, exclude=True)
    spent: float = Field(default=0.0, description="Total accumulated cost for this instance")
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self, profile_or_config: Union[str, Dict[str, Any]] = "default", **kwargs):
        """Accept either a profile name (looked up in profiles.yaml) or a config dict."""
        if isinstance(profile_or_config, str):
            config = self._load_profile_config(profile_or_config)
            # Explicit kwargs override profile values.
            config.update({k: v for k, v in kwargs.items() if k in LLMConfig.model_fields})
            super().__init__(config=LLMConfig(**config))
        else:
            config_kwargs = profile_or_config if isinstance(profile_or_config, dict) else {}
            config_kwargs.update({k: v for k, v in kwargs.items() if k in LLMConfig.model_fields})
            super().__init__(config=LLMConfig(**config_kwargs))
        self._initialize_client()

    def _load_profile_config(self, profile: str) -> Dict[str, Any]:
        """Look up `profile` under 'models' then 'llm_pool'; {} on miss or error."""
        try:
            config_path = DEFAULT_LLM_PROFILE_PATH
            with open(config_path, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
            if profile in config.get("models", {}):
                return config["models"][profile]
            elif profile in config.get("llm_pool", {}):
                pool_config = config["llm_pool"][profile]
                return {k: v for k, v in pool_config.items() if k in LLMConfig.model_fields}
            else:
                return {}
        except Exception as e:
            return {}

    def _initialize_client(self) -> None:
        """Create the AsyncOpenAI client; raises ValueError when no API key is available."""
        if not self.config.api_key:
            self.config.api_key = os.environ.get("OPENAI_API_KEY")
        if not self.config.api_key:
            raise ValueError("Missing required API key. Set OPENAI_API_KEY environment variable.")
        client_args = {
            "api_key": self.config.api_key,
            "timeout": self.config.timeout
        }
        if self.config.base_url:
            client_args["base_url"] = self.config.base_url
        self.client = AsyncOpenAI(**client_args)

    async def __call__(
        self, prompt: str, system_prompt: Optional[str] = None, **generation_args
    ) -> Tuple[str, float]:
        """Send a chat request; returns (response_text, cost_usd_for_this_call)."""
        messages = self._build_messages(prompt, system_prompt)
        params = self._prepare_params(messages, generation_args)
        response = await self._retry_api_call(params)
        content = response.choices[0].message.content
        cost = 0.0
        if self.config.track_costs and (usage := getattr(response, "usage", None)):
            cost, _ = self.cost_calculator.compute_cost(
                response.model, usage.prompt_tokens, usage.completion_tokens
            )
            self.spent += cost
        return content, cost

    def _build_messages(self, prompt: str, system_prompt: Optional[str]) -> List[Dict[str, str]]:
        """Assemble the messages list, prepending the system prompt when given."""
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})
        return messages

    def _prepare_params(
        self, messages: list[Dict[str, str]], generation_args: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Build request kwargs; o-series models never receive a temperature."""
        params: Dict[str, Any] = {
            "model": self.config.model,
            "messages": messages,
        }
        model_name = (self.config.model or "").lower()
        is_o_series = model_name.startswith("o")
        if (self.config.temperature is not None) and (not is_o_series):
            params["temperature"] = self.config.temperature
        if self.config.max_tokens is not None:
            params["max_tokens"] = self.config.max_tokens
        safe_generation_args = dict(generation_args) if generation_args else {}
        if is_o_series:
            safe_generation_args.pop("temperature", None)
        params.update(safe_generation_args)
        return params

    async def _retry_api_call(self, params: Dict[str, Any]) -> Any:
        """Call the API with exponential-backoff retries on transient errors."""
        for attempt in range(self.config.max_retries + 1):
            try:
                return await self.client.chat.completions.create(**params)
            except (APIError, APIConnectionError, APITimeoutError, RateLimitError) as e:
                if attempt ==
self.config.max_retries:
                    # Out of retries: propagate the last error.
                    raise
                backoff_time = self._calculate_backoff(
                    attempt, self.config.retry_base_delay, self.config.timeout
                )
                await asyncio.sleep(backoff_time)

    def _calculate_backoff(self, attempt: int, base: float, max_wait: float) -> float:
        """Exponential backoff with symmetric jitter, capped at `max_wait` seconds."""
        delay = base * (2 ** attempt)
        jitter = delay * self.config.retry_jitter * random.uniform(-1, 1)
        return min(delay + jitter, max_wait)


def create_llm_instance(model_name: str) -> AsyncLLM:
    """Convenience factory: build an AsyncLLM from a profile name."""
    return AsyncLLM(profile_or_config=model_name)


async def main():
    """Smoke test: send one prompt with the default profile and print the result."""
    try:
        llm = AsyncLLM("default")
        prompt = "Hello, what is the capital of France?"
        response, cost = await llm(prompt)
        print("Response:", response)
        print("Cost:", cost)
    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    asyncio.run(main())


================================================ FILE: utils/logger.py ================================================
import os
import logging
from datetime import datetime
from pathlib import Path


class MultiLineFormatter(logging.Formatter):
    def format(self, record):
        # NOTE(review): splitting on '\n' and rejoining with '\n' returns the
        # message unchanged — this formatter is currently a no-op beyond the
        # base class; presumably a per-line prefix was intended. Confirm.
        msg = super().format(record)
        lines = msg.split('\n')
        if len(lines) <= 1:
            return msg
        return '\n'.join(lines)


class SimpleLogger:
    def __init__(self, run_id=None, log_level=logging.INFO):
        """File + console logger rooted at logs/<run_id>/running_logs/run.log."""
        if run_id is None:
            run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.run_id = run_id
        self.base_dir = Path("logs") / run_id
        self.log_dir = self.base_dir / "running_logs"
        self.log_dir.mkdir(parents=True, exist_ok=True)
        # Path separators in run_id would break the logger name; sanitize.
        sanitized_run_id = run_id.replace("/", "_").replace("\\", "_")
        self.logger = logging.getLogger(f"alfworld_run_{sanitized_run_id}")
        self.logger.setLevel(log_level)
        # Drop handlers from any previous run reusing the same logger name.
        self.logger.handlers.clear()
        log_file = self.log_dir / "run.log"
        file_handler = logging.FileHandler(log_file, mode='w', encoding='utf-8')
        file_handler.setLevel(log_level)
        console_handler = logging.StreamHandler()
        console_handler.setLevel(log_level)
        formatter = MultiLineFormatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)
        self.info(f"Starting new run with ID: {run_id}")
        self.info(f"Logs will be saved to: {self.log_dir.absolute()}")

    def info(self, message):
        self.logger.info(message)

    def error(self, message):
        self.logger.error(message)

    def warning(self, message):
        self.logger.warning(message)

    def debug(self, message):
        self.logger.debug(message)

    def log_result(self, result):
        """Log a one-line [SUCCESS]/[FAILED] summary for one instance result."""
        task_id = result.get("task_id", "unknown")
        # Accept either result schema: both_success or is_success.
        success = result.get("both_success", result.get("is_success", False))
        exec_time = result.get("execution_time", result.get("time", 0))
        game_name = result.get("game_name", "")
        status = "SUCCESS" if success else "FAILED"
        self.info(f"[{status}] {task_id} - {game_name} - {exec_time:.2f}s")
        if "error" in result:
            self.error(f"Error in {task_id}: {result['error']}")

    def log_stats(self, stats):
        """Log aggregate run statistics, including per-task-type success rates."""
        self.info("=" * 50)
        self.info("RUN STATISTICS")
        self.info("=" * 50)
        self.info(f"Total tests: {stats['total_tests']}")
        self.info(f"Successful: {stats['successful_tests']}")
        self.info(f"Success rate: {stats['success_rate']:.1%}")
        self.info(f"Average execution time: {stats['average_execution_time']:.2f}s")
        if stats.get('task_types'):
            self.info("\nSuccess rate by task type:")
            for task_type, type_stats in stats['task_types'].items():
                rate = type_stats['rate']
                total = type_stats['total']
                success = type_stats['success']
                self.info(f" {task_type}: {success}/{total} ({rate:.1%})")

    def get_log_dir(self):
        return self.log_dir

    def get_base_dir(self):
        return self.base_dir


================================================ FILE: utils/mockllm.py ================================================
import asyncio


class MockLLM:
    """Stand-in LLM that prints the prompt and reads the reply from stdin."""
    def __init__(self, name="MockLLM"):
        self.name = name

    async def __call__(self, prompt):
        """Show the prompt and collect typed lines until an empty line."""
        print(f"\n--- {self.name} Prompt ---")
        print(prompt)
        print(f"\n--- Please provide your response (enter an empty line to finish) ---")
        lines = []
        while True:
            line = input()
            if line.strip() == "":
                break
            lines.append(line)
        return "\n".join(lines)


async def test_mock_llm():
    """Interactive sanity check for MockLLM (requires a human at the keyboard)."""
    mock_llm = MockLLM(name="TestLLM")
    prompts = [
        "What is the capital of France?",
        "Write a short poem about artificial intelligence."
    ]
    for i, prompt in enumerate(prompts, 1):
        print(f"\nTest {i}:")
        response = await mock_llm(prompt)
        print("\nYour response was:")
        print("-" * 40)
        print(response)
        print("-" * 40)
    print("\nTest completed successfully!")


if __name__ == "__main__":
    asyncio.run(test_mock_llm())