Repository: gravelBridge/AutoGPT-Web-Interaction Branch: main Commit: e47203d67f20 Files: 5 Total size: 26.0 KB Directory structure: gitextract_uqdpmwio/ ├── LICENSE ├── README.md ├── requirements.txt └── web_interaction/ ├── __init__.py └── web_interaction.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 gravelBridge Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # AutoGPT Web Interaction Plugin ![Screenshot 2023-05-01 at 7 37 16 PM](https://user-images.githubusercontent.com/107640947/235567612-0fd49909-197c-4ebf-9f7f-8edf1bf4d7d0.png) The AutoGPT Web Interaction Plugin enables Auto-GPT to interact with websites. Note: The plugin is very flakey on GPT-3.5, I recommend using GPT-4. 
However, it can still perform basic tasks on GPT-3.5. ## Key Features: - Allows Auto-GPT to click elements. - Allows Auto-GPT to type text. - Allows Auto-GPT select elements. - Allows Auto-GPT to scroll ## Installation Follow these steps to configure the Auto-GPT Email Plugin: ### 1. Clone this repository. ### 2. cd into the directory, and run pip install -r requirements.txt ### 3. pip install playwright ### 3. Zip/Compress the web_interaction folder ### 4. Drag the new zip file into the Auto-GPT plugins folder. ### 5. Set `ALLOWLISTED_PLUGINS=AutoGPTWebInteraction,example-plugin1,example-plugin2,etc` in your AutoGPT `.env` file. ### 6. Edit goals When using Auto-GPT please set one of the goals to "Remember to use the Web Interaction Plugin possible". ================================================ FILE: requirements.txt ================================================ playwright colorama typing ================================================ FILE: web_interaction/__init__.py ================================================ """This is the web interaction plugin for Auto-GPT.""" from typing import Any, Dict, List, Optional, Tuple, TypeVar, TypedDict from auto_gpt_plugin_template import AutoGPTPluginTemplate from colorama import Fore PromptGenerator = TypeVar("PromptGenerator") class Message(TypedDict): role: str content: str class AutoGPTWebInteraction(AutoGPTPluginTemplate): """ This is the Auto-GPT web interaction plugin. """ def __init__(self): super().__init__() self._name = "Auto-Web-Interaction-Plugin" self._version = "0.1.0" self._description = "Auto-GPT Web Interaction Plugin: Interact with websites." def post_prompt(self, prompt: PromptGenerator) -> PromptGenerator: from .web_interaction import ( start_browser, go_to_page, scroll, click, type, enter, crawl, type_and_enter, get_current_url, ) prompt.add_resource(""" Ability to interact with websites via the web_interaction plugin. 
Information for the web_interaction plugin: The format of the browser content is highly simplified; all formatting elements are stripped. Interactive elements such as links, inputs, buttons are represented like this: text text Images are rendered as their alt text like this: Don't try to interact with elements that you can't see. CRITICAL: The id parameter specified in , text, etc.. MUST be used for all web_interaction commands that require an id. CRITICAL: Use the command get_dom every time before executing any web_interaction plugin command. CRITICAL: When trying to search something on Google, don't call the dom function. Instead, just use the id value of 3 """) prompt.add_command("start_browser", "Starts the browser for web interaction. Must be ran before attempting to perform any other web interaction plugins.", {}, start_browser) prompt.add_command("go_to_website", "Goes to a website in the web interaction plugin. Must be ran after starting the browser and before attempting to interact with a website.", {"url":""}, go_to_page) prompt.add_command("get_dom", "Returns a simplified DOM of the current web page. The id is specific to this plugin and will be needed to interact with elements. Make sure to run this before interacting with any elements on a webpage. Re-run this each time you're on a new webpage and want to interact with elements.", {}, crawl) prompt.add_command("click_element_by_id", "Clicks an element. Specify the id with the unique id received from the get_dom command. CRITICAL: The ID must be the integer id from the get_dom command.", {"id":""}, click) prompt.add_command("input_text_by_id", "Inputs text to an element. Specify the id with the unique id received from the get_dom command. CRITICAL: The ID must be the integer id from the get_dom command.", {"id":"", "text":""}, type) prompt.add_command("input_text_by_id_and_press_enter", "Inputs text to an element. Specify the id with the unique id received from the get_dom command. 
Also presses enter after finishing inputting text. CRITICAL: The ID must be the integer id from the get_dom command.", {"id":"", "text":""}, type_and_enter) prompt.add_command("scroll", "Scrolls the current website up or down one page. In the arguments, use either \"up\" or \"down\"", {"direction":""}, scroll) prompt.add_command("enter", "Presses enter on the keyboard on the current website.", {}, enter) prompt.add_command("get_url", "Retrieves the current url that the web_interaction plugin is on.", {}, get_current_url) return prompt def can_handle_post_prompt(self) -> bool: """This method is called to check that the plugin can handle the post_prompt method. Returns: bool: True if the plugin can handle the post_prompt method.""" return True def can_handle_on_response(self) -> bool: """This method is called to check that the plugin can handle the on_response method. Returns: bool: True if the plugin can handle the on_response method.""" return False def on_response(self, response: str, *args, **kwargs) -> str: """This method is called when a response is received from the model.""" pass def can_handle_on_planning(self) -> bool: """This method is called to check that the plugin can handle the on_planning method. Returns: bool: True if the plugin can handle the on_planning method.""" return False def on_planning( self, prompt: PromptGenerator, messages: List[Message] ) -> Optional[str]: """This method is called before the planning chat completion is done. Args: prompt (PromptGenerator): The prompt generator. messages (List[str]): The list of messages. """ pass def can_handle_post_planning(self) -> bool: """This method is called to check that the plugin can handle the post_planning method. Returns: bool: True if the plugin can handle the post_planning method.""" return False def post_planning(self, response: str) -> str: """This method is called after the planning chat completion is done. Args: response (str): The response. Returns: str: The resulting response. 
""" pass def can_handle_pre_instruction(self) -> bool: """This method is called to check that the plugin can handle the pre_instruction method. Returns: bool: True if the plugin can handle the pre_instruction method.""" return False def pre_instruction(self, messages: List[Message]) -> List[Message]: """This method is called before the instruction chat is done. Args: messages (List[Message]): The list of context messages. Returns: List[Message]: The resulting list of messages. """ pass def can_handle_on_instruction(self) -> bool: """This method is called to check that the plugin can handle the on_instruction method. Returns: bool: True if the plugin can handle the on_instruction method.""" return False def on_instruction(self, messages: List[Message]) -> Optional[str]: """This method is called when the instruction chat is done. Args: messages (List[Message]): The list of context messages. Returns: Optional[str]: The resulting message. """ pass def can_handle_post_instruction(self) -> bool: """This method is called to check that the plugin can handle the post_instruction method. Returns: bool: True if the plugin can handle the post_instruction method.""" return False def post_instruction(self, response: str) -> str: """This method is called after the instruction chat is done. Args: response (str): The response. Returns: str: The resulting response. """ pass def can_handle_pre_command(self) -> bool: """This method is called to check that the plugin can handle the pre_command method. Returns: bool: True if the plugin can handle the pre_command method.""" return False def pre_command( self, command_name: str, arguments: Dict[str, Any] ) -> Tuple[str, Dict[str, Any]]: """This method is called before the command is executed. Args: command_name (str): The command name. arguments (Dict[str, Any]): The arguments. Returns: Tuple[str, Dict[str, Any]]: The command name and the arguments. 
""" pass def can_handle_post_command(self) -> bool: """This method is called to check that the plugin can handle the post_command method. Returns: bool: True if the plugin can handle the post_command method.""" return False def post_command(self, command_name: str, response: str) -> str: """This method is called after the command is executed. Args: command_name (str): The command name. response (str): The response. Returns: str: The resulting response. """ pass def can_handle_chat_completion( self, messages: Dict[Any, Any], model: str, temperature: float, max_tokens: int ) -> bool: """This method is called to check that the plugin can handle the chat_completion method. Args: messages (List[Message]): The messages. model (str): The model name. temperature (float): The temperature. max_tokens (int): The max tokens. Returns: bool: True if the plugin can handle the chat_completion method.""" return False def handle_chat_completion( self, messages: List[Message], model: str, temperature: float, max_tokens: int ) -> str: """This method is called when the chat completion is done. Args: messages (List[Message]): The messages. model (str): The model name. temperature (float): The temperature. max_tokens (int): The max tokens. Returns: str: The resulting response. 
""" pass def can_handle_text_embedding( self, text: str ) -> bool: return False def handle_text_embedding( self, text: str ) -> list: pass def can_handle_user_input(self, user_input: str) -> bool: return False def user_input(self, user_input: str) -> str: return user_input def can_handle_report(self) -> bool: return False def report(self, message: str) -> None: pass ================================================ FILE: web_interaction/web_interaction.py ================================================ from playwright.sync_api import sync_playwright from sys import argv, exit, platform black_listed_elements = set(["html", "head", "title", "meta", "iframe", "body", "style", "script", "path", "svg", "br", "::marker",]) def start_browser(): global browser global page browser = ( sync_playwright() .start() .chromium.launch( headless=False, ) ) page = browser.new_page() page.set_viewport_size({"width": 1280, "height": 1080}) return "Browser successfully started!" def go_to_page(url): global client global page_element_buffer try: page.goto(url=url if "://" in url else "http://" + url) client = page.context.new_cdp_session(page) page_element_buffer = {} except: return "Failed to go to url, please try again and make sure the url is correct." return "Now on url " + url def scroll(direction): if direction == "up": page.evaluate( "(document.scrollingElement || document.body).scrollTop = (document.scrollingElement || document.body).scrollTop - window.innerHeight;" ) return "Scrolled!" elif direction == "down": page.evaluate( "(document.scrollingElement || document.body).scrollTop = (document.scrollingElement || document.body).scrollTop + window.innerHeight;" ) return "Scrolled!" else: return "Scroll direction invalid." 
# Removes the target= attribute from every link on the page before a click.
_STRIP_LINK_TARGETS_JS = """
    links = document.getElementsByTagName("a");
    for (var i = 0; i < links.length; i++) {
        links[i].removeAttribute("target");
    }
"""


def _click_element(id):
    """Click the centre of the element registered under ``id``.

    Returns True if the element was found in ``page_element_buffer`` and
    clicked, False if ``id`` is not a valid integer or is unknown.
    """
    page.evaluate(_STRIP_LINK_TARGETS_JS)
    try:
        element = page_element_buffer.get(int(id))
    except (TypeError, ValueError):
        # A non-numeric id cannot match any element.
        return False
    if not element:
        return False
    page.mouse.click(element.get("center_x"), element.get("center_y"))
    return True


def click(id):
    """Click the element with the given id from the last ``crawl()``.

    Args:
        id: Element id as an int or a numeric string.

    Returns:
        str: "Successfully clicked!" or "Could not find element".
    """
    if _click_element(id):
        return "Successfully clicked!"
    return "Could not find element"


def type(id, text):
    """Click the element with the given id, then type ``text`` on the keyboard.

    Nothing is typed when the element cannot be found; the failure is
    returned instead, so text never lands in whatever happens to have focus.

    Args:
        id: Element id as an int or a numeric string.
        text: Text to type.

    Returns:
        str: A confirmation message, or "Could not find element".
    """
    if not _click_element(id):
        return "Could not find element"
    page.keyboard.type(text)
    # f-string so that an integer id does not raise TypeError.
    return f"Typed {text} into {id}"


def enter():
    """Press the Enter key on the current page.

    Returns:
        str: A fixed confirmation message.
    """
    page.keyboard.press("Enter")
    return "Pressed enter!"


def type_and_enter(id, text):
    """Click the element with the given id, type ``text``, then press Enter.

    Nothing is typed or submitted when the element cannot be found.

    Args:
        id: Element id as an int or a numeric string.
        text: Text to type.

    Returns:
        str: A confirmation message, or "Could not find element".
    """
    if not _click_element(id):
        return "Could not find element"
    page.keyboard.type(text)
    page.keyboard.press("Enter")
    return "Inputted text, and pressed enter!"


def get_current_url():
    """Return the URL of the current page.

    Returns:
        str: The page URL, or an error message if it could not be read.
    """
    try:
        return page.url
    except Exception:
        return "Error retrieving current URL."


def crawl(max_elements=125):
    """Return a simplified, id-tagged listing of the elements on the page.

    Takes a DOM snapshot through the CDP session in the global ``client``,
    keeps the nodes whose layout box passes the window-bounds test, merges
    text and attributes found inside links and buttons into that link or
    button, and assigns consecutive integer ids to the surviving elements.
    The id -> element mapping is stored in the global
    ``page_element_buffer`` for the click/type commands.

    Args:
        max_elements: Maximum number of element strings to return.

    Returns:
        str: ``repr`` of a list of element strings. When more than
        ``max_elements`` were found, only the first ``max_elements`` are
        returned, followed by a notice that the listing was cut short.
    """
    device_pixel_ratio = page.evaluate("window.devicePixelRatio")
    if platform == "darwin" and device_pixel_ratio == 1:  # lies
        device_pixel_ratio = 2

    # NOTE(review): the visible area is taken from window.screen.* rather than
    # the viewport size, and the resulting coordinates are later clicked
    # as-is - confirm this is still correct after scrolling.
    win_upper_bound = page.evaluate("window.pageYOffset")
    win_left_bound = page.evaluate("window.pageXOffset")
    win_width = page.evaluate("window.screen.width")
    win_height = page.evaluate("window.screen.height")
    win_right_bound = win_left_bound + win_width
    win_lower_bound = win_upper_bound + win_height

    tree = client.send(
        "DOMSnapshot.captureSnapshot",
        {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
    )
    strings = tree["strings"]
    document = tree["documents"][0]
    nodes = document["nodes"]
    backend_node_id = nodes["backendNodeId"]
    attributes = nodes["attributes"]
    node_value = nodes["nodeValue"]
    parent = nodes["parentIndex"]
    node_names = nodes["nodeName"]
    clickable_nodes = set(nodes["isClickable"]["index"])

    input_value = nodes["inputValue"]
    layout = document["layout"]
    bounds = layout["bounds"]

    # node index -> position in the layout arrays (first occurrence wins),
    # built once instead of a list.index() scan per node.
    layout_cursor_by_node = {}
    for cursor, node_idx in enumerate(layout["nodeIndex"]):
        layout_cursor_by_node.setdefault(node_idx, cursor)

    # node index -> string-table index of the input's current value.
    input_text_by_node = {}
    for node_idx, string_idx in zip(input_value["index"], input_value["value"]):
        input_text_by_node.setdefault(node_idx, string_idx)

    child_nodes = {}
    elements_in_view_port = []

    # str(node index) -> (is inside a <tag>, index of that <tag> node).
    anchor_ancestry = {"-1": (False, None)}
    button_ancestry = {"-1": (False, None)}

    def convert_name(node_name, has_click_handler):
        """Map a node name to the tag name used in the output listing."""
        if node_name == "a":
            return "link"
        if node_name == "input" or node_name == "textarea":
            return "input"
        if node_name == "img":
            return "img"
        if node_name == "button" or has_click_handler:  # found pages that needed this quirk
            return "button"
        return "text"

    def find_attributes(attribute_indexes, wanted):
        """Return {name: value} for the wanted attributes present on a node.

        ``attribute_indexes`` is a flat list of alternating name/value
        indexes into ``strings``; entries with a negative value index are
        skipped.
        """
        remaining = set(wanted)
        found = {}
        flat = iter(attribute_indexes)
        for key_index, value_index in zip(flat, flat):
            if value_index < 0:
                continue
            key = strings[key_index]
            if key in remaining:
                found[key] = strings[value_index]
                remaining.discard(key)
                if not remaining:
                    break
        return found

    def add_to_hash_tree(hash_tree, tag, node_id, node_name, parent_id):
        """Record whether ``node_id`` is, or lies inside, a ``tag`` element."""
        parent_id_str = str(parent_id)
        if parent_id_str not in hash_tree:
            parent_name = strings[node_names[parent_id]].lower()
            grand_parent_id = parent[parent_id]
            add_to_hash_tree(hash_tree, tag, parent_id, parent_name, grand_parent_id)

        is_parent_desc_anchor, anchor_id = hash_tree[parent_id_str]

        # even if the anchor is nested in another anchor, we set the "root"
        # for all descendants to be ::Self
        if node_name == tag:
            value = (True, node_id)
        elif is_parent_desc_anchor:
            # reuse the parent's anchor_id (which could be much higher in the tree)
            value = (True, anchor_id)
        else:
            # not a descendant of an anchor, most likely it will become text,
            # an interactive element or discarded
            value = (False, None)

        hash_tree[str(node_id)] = value
        return value

    for index, node_name_index in enumerate(node_names):
        node_parent = parent[index]
        node_name = strings[node_name_index].lower()

        is_ancestor_of_anchor, anchor_id = add_to_hash_tree(
            anchor_ancestry, "a", index, node_name, node_parent
        )
        is_ancestor_of_button, button_id = add_to_hash_tree(
            button_ancestry, "button", index, node_name, node_parent
        )

        cursor = layout_cursor_by_node.get(index)
        if cursor is None:
            # Node has no layout box.
            continue
        if node_name in black_listed_elements:
            continue

        x, y, width, height = (dim / device_pixel_ratio for dim in bounds[cursor])
        partially_is_in_viewport = (
            x < win_right_bound
            and x + width >= win_left_bound
            and y < win_lower_bound
            and y + height >= win_upper_bound
        )
        if not partially_is_in_viewport:
            continue

        # inefficient to grab the same set of keys for all kinds of objects but its fine for now
        element_attributes = find_attributes(
            attributes[index], ("type", "placeholder", "aria-label", "title", "alt")
        )

        ancestor_exception = is_ancestor_of_anchor or is_ancestor_of_button
        ancestor_node = None
        if ancestor_exception:
            ancestor_key = str(anchor_id) if is_ancestor_of_anchor else str(button_id)
            ancestor_node = child_nodes.setdefault(ancestor_key, [])

        if node_name == "#text" and ancestor_exception:
            text = strings[node_value[index]]
            if text not in ("|", "•"):
                ancestor_node.append({"type": "type", "value": text})
            # Text inside a link/button is merged into it, never listed on its own.
            continue

        meta_data = []
        if (
            node_name == "input" and element_attributes.get("type") == "submit"
        ) or node_name == "button":
            node_name = "button"
            element_attributes.pop("type", None)  # prevent [button ... (button)..]

        for key, value in element_attributes.items():
            if ancestor_exception:
                ancestor_node.append({"type": "attribute", "key": key, "value": value})
            else:
                meta_data.append(value)

        element_node_value = None
        value_index = node_value[index]
        if value_index >= 0:
            element_node_value = strings[value_index]
            if element_node_value == "|":
                # commonly used as a separator, does not add much context -
                # lets save ourselves some token space
                continue
        elif node_name == "input" and index in input_text_by_node:
            text_index = input_text_by_node[index]
            if text_index >= 0:
                element_node_value = strings[text_index]

        # remove redundant elements
        if ancestor_exception and node_name not in ("a", "button"):
            continue

        elements_in_view_port.append(
            {
                "node_index": str(index),
                "backend_node_id": backend_node_id[index],
                "node_name": node_name,
                "node_value": element_node_value,
                "node_meta": meta_data,
                "is_clickable": index in clickable_nodes,
                "origin_x": int(x),
                "origin_y": int(y),
                "center_x": int(x + (width / 2)),
                "center_y": int(y + (height / 2)),
            }
        )

    # Ids are reassigned from 0 on every crawl, so drop the previous mapping:
    # otherwise ids left over from an earlier, longer listing stay clickable.
    page_element_buffer.clear()

    # lets filter further to remove anything that does not hold any text nor
    # has click handlers + merge text from leaf #text nodes with the parent
    elements_of_interest = []
    id_counter = 0

    for element in elements_in_view_port:
        node_index = element.get("node_index")
        meta_data = element.get("node_meta")
        element_value = element.get("node_value")

        inner_text = f"{element_value} " if element_value else ""
        meta = ""

        for child in child_nodes.get(node_index, []):
            entry_value = child.get("value")
            if child.get("type") == "attribute":
                entry_key = child.get("key")
                meta_data.append(f'{entry_key}="{entry_value}"')
            else:
                inner_text += f"{entry_value} "

        if meta_data:
            meta_string = " ".join(meta_data)
            meta = f" {meta_string}"

        inner_text = inner_text.strip()

        converted_node_name = convert_name(
            element.get("node_name"), element.get("is_clickable")
        )

        # not very elegant, more like a placeholder: an element without text is
        # only kept if it is a link/input/img, or a button that has meta text
        always_kept = converted_node_name in ("link", "input", "img", "textarea")
        described_button = converted_node_name == "button" and meta != ""
        if inner_text == "" and not always_kept and not described_button:
            continue

        page_element_buffer[id_counter] = element

        # NOTE(review): elements with text are emitted without a closing tag,
        # while empty ones are self-closed - confirm this is intended.
        if inner_text != "":
            elements_of_interest.append(
                f"""<{converted_node_name} id={id_counter}{meta}>{inner_text}"""
            )
        else:
            elements_of_interest.append(
                f"""<{converted_node_name} id={id_counter}{meta}/>"""
            )
        id_counter += 1

    if len(elements_of_interest) > max_elements:
        # Slice to exactly max_elements (the old loop returned one too many).
        return repr(elements_of_interest[:max_elements]) + "This is not part of the DOM, note that not the entire DOM was returned as it exceeded the context limit. If you're sure what you're need is not included in this DOM, you should find a workaround."

    return repr(elements_of_interest)