DocsGPT/application/agents/tools/read_webpage.py

import requests
from markdownify import markdownify

from application.agents.tools.base import Tool
from application.core.url_validation import validate_url, SSRFError
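# Note: validate_url / SSRFError come from application.core.url_validation.
# Based on how they are used below, validate_url is expected to return a
# vetted URL and raise SSRFError for targets that could enable SSRF (e.g.
# private or link-local hosts); the actual checks live in that module.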


class ReadWebpageTool(Tool):
    """
    Read Webpage (browser)

    A tool to fetch the HTML content of a URL and convert it to Markdown.
    """

    def __init__(self, config=None):
        """
        Initializes the tool.

        :param config: Optional configuration dictionary. Not used by this tool.
        """
        self.config = config

    def execute_action(self, action_name: str, **kwargs) -> str:
        """
        Executes the specified action. For this tool, the only action is 'read_webpage'.

        :param action_name: The name of the action to execute. Should be 'read_webpage'.
        :param kwargs: Keyword arguments; must include 'url'.
        :return: The Markdown content of the webpage or an error message.
        """
        if action_name != "read_webpage":
            return f"Error: Unknown action '{action_name}'. This tool only supports 'read_webpage'."

        url = kwargs.get("url")
        if not url:
            return "Error: URL parameter is missing."

        # Validate the URL to prevent SSRF attacks before fetching it.
        try:
            url = validate_url(url)
        except SSRFError as e:
            return f"Error: URL validation failed - {e}"

        try:
            response = requests.get(
                url,
                timeout=10,
                headers={"User-Agent": "DocsGPT-Agent/1.0"},
            )
            response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
            html_content = response.text
            # Convert the fetched HTML to Markdown.
            markdown_content = markdownify(
                html_content, heading_style="ATX", newline_style="BACKSLASH"
            )
            return markdown_content
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL {url}: {e}"
        except Exception as e:
            return f"Error processing URL {url}: {e}"

    def get_actions_metadata(self):
        """
        Returns metadata for the actions supported by this tool.
        """
        return [
            {
                "name": "read_webpage",
                "description": "Fetches the HTML content of a given URL and returns it as clean Markdown text. Input must be a valid URL.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "url": {
                            "type": "string",
                            "description": "The fully qualified URL of the webpage to read (e.g., 'https://www.example.com').",
                        }
                    },
                    "required": ["url"],
                    "additionalProperties": False,
                },
            }
        ]

    def get_config_requirements(self):
        """
        Returns a dictionary describing the configuration requirements for the tool.

        This tool does not require any specific configuration.
        """
        return {}
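

# Usage sketch (illustrative only, not part of the original module): shows how
# the tool could be exercised directly, the way an agent runtime would after
# matching the "read_webpage" action from get_actions_metadata(). The example
# URL is a placeholder.
if __name__ == "__main__":
    tool = ReadWebpageTool()

    # Inspect the action schema the tool advertises to the agent.
    print(tool.get_actions_metadata())

    # Fetch a page and print the start of the Markdown conversion
    # (or an error string if validation or the request fails).
    result = tool.execute_action("read_webpage", url="https://www.example.com")
    print(result[:500])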