# --- application/api/user/routes.py (changed section) ---
# New import: the Celery task that ingests remote (URL-based) sources.
from application.api.user.tasks import ingest, ingest_remote


@user.route("/api/remote", methods=["POST"])
def upload_remote():
    """Vectorize and index a remote source (web page, crawler run, sitemap).

    Expects multipart/form fields:
        user:   owner identifier (sanitized via secure_filename)
        source: loader type key understood by RemoteCreator
                ('url', 'crawler', 'sitemap')
        name:   human-readable job name
        data:   the URL (or source payload) to ingest

    Returns a JSON dict: {"status": "ok", "task_id": ...} on success, or a
    {"status": <error reason>} dict when a field is missing/empty.
    """
    if "user" not in request.form:
        return {"status": "no user"}
    # FIX: the original bound this to ``user``, shadowing the module-level
    # ``user`` Blueprint that the route decorator above references.
    user_id = secure_filename(request.form["user"])
    if "source" not in request.form:
        return {"status": "no source"}
    source = secure_filename(request.form["source"])
    if "name" not in request.form:
        return {"status": "no name"}
    job_name = secure_filename(request.form["name"])
    # ``data`` carries the URL itself; unlike file upload there is no
    # file part to check.
    if "data" not in request.form:
        print("No data")
        return {"status": "no data"}
    source_data = request.form["data"]

    if source_data:
        task = ingest_remote.delay(
            source_data=source_data, job_name=job_name, user=user_id, loader=source
        )
        return {"status": "ok", "task_id": task.id}
    return {"status": "error"}
# --- application/api/user/tasks.py ---
from application.worker import ingest_worker, remote_worker
from application.celery import celery


@celery.task(bind=True)
def ingest(self, directory, formats, name_job, filename, user):
    """Celery task: ingest local files by delegating to ingest_worker."""
    resp = ingest_worker(self, directory, formats, name_job, filename, user)
    return resp


@celery.task(bind=True)
def ingest_remote(self, source_data, job_name, user, loader):
    """Celery task: ingest a remote source by delegating to remote_worker."""
    resp = remote_worker(self, source_data, job_name, user, loader)
    return resp


# --- application/parser/remote/base.py ---
"""Base reader class."""
from abc import abstractmethod
from typing import Any, List

from langchain.docstore.document import Document as LCDocument
from application.parser.schema.base import Document


class BaseRemote:
    """Common interface for loaders that pull documents from remote sources."""

    @abstractmethod
    def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
        """Load documents from the remote source."""

    def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]:
        """Load data converted to LangChain document format."""
        docs = self.load_data(**load_kwargs)
        return [d.to_langchain_format() for d in docs]


# --- application/parser/remote/crawler_loader.py ---
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from application.parser.remote.base import BaseRemote


class CrawlerLoader(BaseRemote):
    """Breadth-first, same-domain crawler that loads each visited page
    through langchain's WebBaseLoader."""

    def __init__(self, limit=10):
        from langchain.document_loaders import WebBaseLoader
        self.loader = WebBaseLoader  # document loader class, instantiated per URL
        self.limit = limit  # max pages to visit; None means unlimited

    def load_data(self, inputs):
        """Crawl starting at *inputs* (a URL, or a list whose first element
        is the URL) and return the loaded documents from every page reached
        on the same domain, up to ``self.limit`` pages."""
        url = inputs
        if isinstance(url, list) and url:
            url = url[0]

        # Assume http when no scheme was supplied.
        if not urlparse(url).scheme:
            url = "http://" + url

        visited_urls = set()            # URLs already fetched (or attempted)
        base_url = urlparse(url).scheme + "://" + urlparse(url).hostname
        urls_to_visit = [url]           # FIFO frontier, seeded with the start URL
        loaded_content = []             # accumulated documents

        while urls_to_visit:
            current_url = urls_to_visit.pop(0)
            visited_urls.add(current_url)

            try:
                response = requests.get(current_url)
                response.raise_for_status()
                loader = self.loader([current_url])
                loaded_content.extend(loader.load())
            except Exception as e:
                # Best-effort crawl: log and move on to the next URL.
                print(f"Error processing URL {current_url}: {e}")
                continue

            # Extract same-domain links from the fetched page.
            soup = BeautifulSoup(response.text, 'html.parser')
            all_links = [
                urljoin(current_url, a['href'])
                for a in soup.find_all('a', href=True)
                if base_url in urljoin(current_url, a['href'])
            ]

            # FIX: the original only filtered against visited_urls and then
            # deduplicated with list(set(...)), which (a) still allowed a URL
            # to be queued while an identical copy was pending and (b) made
            # the crawl order nondeterministic. Filter against both sets and
            # keep FIFO order.
            for link in all_links:
                if link not in visited_urls and link not in urls_to_visit:
                    urls_to_visit.append(link)

            # Stop once the page budget is exhausted.
            if self.limit is not None and len(visited_urls) >= self.limit:
                break

        return loaded_content
# --- application/parser/remote/github_loader.py: new empty file (placeholder) ---

# --- application/parser/remote/remote_creator.py ---
from application.parser.remote.sitemap_loader import SitemapLoader
from application.parser.remote.crawler_loader import CrawlerLoader
from application.parser.remote.web_loader import WebLoader


class RemoteCreator:
    """Factory mapping a source-type string to its remote loader class."""

    loaders = {
        'url': WebLoader,
        'sitemap': SitemapLoader,
        'crawler': CrawlerLoader
    }

    @classmethod
    def create_loader(cls, type, *args, **kwargs):
        """Instantiate the loader registered under *type* (case-insensitive).

        Raises:
            ValueError: when *type* is not a known loader key.
        """
        loader_class = cls.loaders.get(type.lower())
        if not loader_class:
            # FIX: original said "No LLM class found" — copy-pasted from the
            # LLM factory and misleading in this remote-loader context.
            raise ValueError(f"No remote loader class found for type {type}")
        return loader_class(*args, **kwargs)


# --- application/parser/remote/sitemap_loader.py ---
import requests
import re  # Import regular expression library
import xml.etree.ElementTree as ET
from application.parser.remote.base import BaseRemote


class SitemapLoader(BaseRemote):
    """Loads every page listed in a sitemap (or a single plain URL) through
    langchain's WebBaseLoader."""

    def __init__(self, limit=20):
        from langchain.document_loaders import WebBaseLoader
        self.loader = WebBaseLoader
        self.limit = limit  # max number of listed URLs to load; None = unlimited

    def load_data(self, inputs):
        """Fetch the sitemap at *inputs* (a URL, or a list whose first element
        is the URL), then load and return documents for up to ``self.limit``
        of the URLs it lists."""
        sitemap_url = inputs
        # FIX: the original assigned the first list element to a dead local
        # ``url`` and then kept passing the whole *list* to _extract_urls(),
        # which requests.get() cannot fetch. Unwrap in place instead.
        if isinstance(sitemap_url, list) and sitemap_url:
            sitemap_url = sitemap_url[0]

        urls = self._extract_urls(sitemap_url)
        if not urls:
            print(f"No URLs found in the sitemap: {sitemap_url}")
            return []

        documents = []
        processed_urls = 0  # counter enforcing the page budget
        for url in urls:
            if self.limit is not None and processed_urls >= self.limit:
                break
            try:
                loader = self.loader([url])
                documents.extend(loader.load())
                processed_urls += 1
            except Exception as e:
                # Best-effort: log the failure and continue with the next URL.
                print(f"Error processing URL {url}: {e}")
                continue

        return documents

    def _extract_urls(self, sitemap_url):
        """Return the list of URLs referenced by *sitemap_url*: the parsed
        <loc> entries when the response is a sitemap, otherwise the URL
        itself as a one-element list. Returns [] on fetch failure."""
        try:
            response = requests.get(sitemap_url)
            response.raise_for_status()
        except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e:
            print(f"Failed to fetch sitemap: {sitemap_url}. Error: {e}")
            return []

        if self._is_sitemap(response):
            return self._parse_sitemap(response.content)
        # Not a sitemap: treat the URL as a single page to load.
        return [sitemap_url]

    def _is_sitemap(self, response):
        """Heuristically decide whether *response* is a sitemap document."""
        content_type = response.headers.get('Content-Type', '')
        if 'xml' in content_type or response.url.endswith('.xml'):
            return True
        # NOTE(review): the tail of this method was unreadable in the patch;
        # the conventional check is for the sitemap root elements — confirm
        # against the original source.
        if '<urlset' in response.text or '<sitemapindex' in response.text:
            return True
        return False

    def _parse_sitemap(self, content):
        """Extract every <loc> URL from sitemap XML *content*.

        NOTE(review): reconstructed — the original body was unreadable in the
        patch. This namespace-agnostic ElementTree walk matches the module's
        imports (``ET``); verify against the original implementation.
        """
        try:
            root = ET.fromstring(content)
        except ET.ParseError as e:
            print(f"Failed to parse sitemap XML: {e}")
            return []
        return [
            el.text.strip()
            for el in root.iter()
            if el.tag.endswith('loc') and el.text
        ]
+ + {isOpen && ( +
+ {options.map((option, index) => ( +
+ { + onSelect(option); + setIsOpen(false); + }} + className="ml-2 flex-1 overflow-hidden overflow-ellipsis whitespace-nowrap px-1 py-3" + > + {option?.label} + +
+ ))} +
+ )} +
+ ); +} export default function Upload({ modalState, setModalState, @@ -14,6 +67,14 @@ export default function Upload({ setModalState: (state: ActiveState) => void; }) { const [docName, setDocName] = useState(''); + const [urlName, setUrlName] = useState('') + const [url, setUrl] = useState('') + const urlOptions: urlOption[] = [ + { label: 'Crawler', value: 'crawler' }, + // { label: 'Sitemap', value: 'sitemap' }, + { label: 'Link', value: 'url' }] + const [urlType, setUrlType] = useState(null) + const [activeTab, setActiveTab] = useState('file'); const [files, setfiles] = useState([]); const [progress, setProgress] = useState<{ type: 'UPLOAD' | 'TRAINIING'; @@ -55,9 +116,8 @@ export default function Upload({ setProgress(undefined); setModalState('INACTIVE'); }} - className={`rounded-3xl bg-purple-30 px-4 py-2 text-sm font-medium text-white ${ - isCancellable ? '' : 'hidden' - }`} + className={`rounded-3xl bg-purple-30 px-4 py-2 text-sm font-medium text-white ${isCancellable ? '' : 'hidden' + }`} > Finish @@ -149,6 +209,29 @@ export default function Upload({ xhr.send(formData); }; + const uploadRemote = () => { + console.log("here") + const formData = new FormData(); + formData.append('name', urlName); + formData.append('user', 'local'); + if (urlType !== null) { + formData.append('source', urlType?.value); + } + formData.append('data', url); + const apiHost = import.meta.env.VITE_API_HOST; + const xhr = new XMLHttpRequest(); + xhr.upload.addEventListener('progress', (event) => { + const progress = +((event.loaded / event.total) * 100).toFixed(2); + setProgress({ type: 'UPLOAD', percentage: progress }); + }); + xhr.onload = () => { + const { task_id } = JSON.parse(xhr.responseText); + setProgress({ type: 'TRAINIING', percentage: 0, taskId: task_id }); + }; + xhr.open('POST', `${apiHost + '/api/remote'}`); + xhr.send(formData); + }; + const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop, multiple: false, @@ -166,7 +249,6 @@ export default 
function Upload({ ['.docx'], }, }); - let view; if (progress?.type === 'UPLOAD') { view = ; @@ -176,42 +258,86 @@ export default function Upload({ view = ( <>

Upload New Documentation

-

- Please upload .pdf, .txt, .rst, .docx, .md, .zip limited to 25mb -

- setDocName(e.target.value)} - > -
- Name -
-
- - - Choose Files - -
-
-

Uploaded Files

- {files.map((file) => ( -

- {file.name} -

- ))} - {files.length === 0 &&

None

} +
+ +
+ { + activeTab === 'file' && ( + <> + setDocName(e.target.value)} + > +
+ Name +
+
+ + + Choose Files + +
+

+ Please upload .pdf, .txt, .rst, .docx, .md, .zip limited to 25mb +

+
+

Uploaded Files

+ {files.map((file) => ( +

+ {file.name} +

+ ))} + {files.length === 0 &&

None

} +
+ + ) + } + { + activeTab === 'remote' && ( + <> + setUrlType(value)} selectedOption={urlType} options={urlOptions} /> + setUrlName(e.target.value)} + > +
+ Name +
+ setUrl(e.target.value)} + > +
+ Link +
+ + ) + }
@@ -221,7 +347,7 @@ export default function Upload({ setfiles([]); setModalState('INACTIVE'); }} - className="font-medium dark:text-light-gray" + className="font-medium dark:text-light-gray cursor-pointer" > Cancel @@ -232,9 +358,8 @@ export default function Upload({ return (
{view} diff --git a/frontend/tailwind.config.cjs b/frontend/tailwind.config.cjs index 5946c5a3..50af33c8 100644 --- a/frontend/tailwind.config.cjs +++ b/frontend/tailwind.config.cjs @@ -43,7 +43,10 @@ module.exports = { 'dark-charcoal':'#2F3036', 'bright-gray':'#ECECF1', 'outer-space':'#444654', - 'gun-metal':'#2E303E' + 'gun-metal':'#2E303E', + 'sonic-silver':'#747474', + 'soap':'#D8CCF1', + 'independence':'#54546D' }, }, },