docs: add split processing example (#303)

Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Tiago Santana
2025-09-04 09:42:11 +01:00
committed by GitHub
parent fe98338239
commit 0d4545a65a
6 changed files with 170 additions and 6 deletions

View File

@@ -0,0 +1,124 @@
import json
import time
from pathlib import Path
import httpx
from pydantic import BaseModel
from pypdf import PdfReader
from docling_core.types.doc.document import DoclingDocument
# --- Example configuration (edit these to match your setup) ---

# PDF that main() splits into page ranges and submits for conversion.
path_to_pdf = Path("./tests/2206.01062v1.pdf")

# Number of pages covered by each conversion request.
pages_per_file = 4

# Prefix shared by every request URL built in this script.
base_url = "http://localhost:5001/v1"

# Directory that receives the per-range JSON files and the concatenated result.
out_dir = Path("examples/splitted_pdf/")
class ConvertedSplittedPdf(BaseModel):
    """Bookkeeping record for one page-range conversion task."""

    # Identifier of the conversion task this record tracks.
    task_id: str

    # Set from the return value of check_task_status(); True once the task
    # has been reported as successful.
    conversion_finished: bool = False

    # Decoded JSON body returned by get_task_result(); None until fetched.
    result: dict | None = None
def get_task_result(task_id: str):
    """Fetch the result payload of a conversion task.

    Args:
        task_id: Identifier of the task whose result should be downloaded.

    Returns:
        The decoded JSON body of the ``/result/{task_id}`` response.

    Raises:
        httpx.HTTPStatusError: If the service answers with a 4xx/5xx status.
    """
    response = httpx.get(
        f"{base_url}/result/{task_id}",
        timeout=15,
    )
    # Fail here with a clear HTTP error instead of handing an error body to
    # the caller as if it were a conversion result.
    response.raise_for_status()
    return response.json()
def check_task_status(task_id: str):
    """Poll the service once for the status of a conversion task.

    When the task is not finished yet, this call blocks for five seconds
    before returning so that callers can poll in a plain loop without
    hammering the service.

    Args:
        task_id: Identifier of the task to poll.

    Returns:
        True if the reported ``task_status`` is ``"success"``, else False.

    Raises:
        RuntimeError: If the reported status is ``"failure"`` or ``"revoked"``.
        httpx.HTTPStatusError: If the service answers with a 4xx/5xx status.
    """
    response = httpx.get(f"{base_url}/status/poll/{task_id}", timeout=15)
    response.raise_for_status()
    task_status = response.json()["task_status"]

    if task_status in ("failure", "revoked"):
        raise RuntimeError("A conversion failed")

    if task_status == "success":
        # Nothing left to wait for; return immediately instead of sleeping.
        return True

    # Task is still in progress: pause before the caller polls again.
    time.sleep(5)
    return False
def post_file(file_path: Path, start_page: int, end_page: int):
    """Submit a PDF for asynchronous conversion of one page range.

    The whole file is uploaded; the range to convert is passed through the
    ``page_range`` form field.

    Args:
        file_path: Path of the PDF to upload.
        start_page: First page of the range, sent as ``page_range[0]``.
        end_page: Last page of the range, sent as ``page_range[1]``.

    Returns:
        The ``task_id`` value from the service's JSON response.

    Raises:
        httpx.HTTPStatusError: If the service answers with a 4xx/5xx status.
    """
    payload = {
        "to_formats": ["json"],
        "image_export_mode": "placeholder",
        "ocr": False,
        "abort_on_error": False,
        "page_range": [start_page, end_page],
    }

    # Keep the upload inside a context manager so the file handle is closed
    # once the request has completed, even if the request raises.
    with file_path.open("rb") as pdf_stream:
        files = {
            "files": (file_path.name, pdf_stream, "application/pdf"),
        }
        response = httpx.post(
            f"{base_url}/convert/file/async",
            files=files,
            data=payload,
            timeout=15,
        )

    # Surface rejected uploads as HTTP errors rather than a KeyError below.
    response.raise_for_status()
    return response.json()["task_id"]
def main():
    """Convert a PDF in page-range chunks and merge the results.

    Steps:
        1. Count the pages of ``path_to_pdf``.
        2. Submit one async conversion task per ``pages_per_file`` pages.
        3. Poll until every task has finished.
        4. Download each result, save it as ``splited_json_<i>.json`` in
           ``out_dir``, then reload the files and write their concatenation
           to ``concatenated.json``.

    Raises:
        RuntimeError: If a task fails (raised by ``check_task_status``) or a
            result contains no JSON document.
    """
    source_pdf = path_to_pdf

    # Only the page count is needed locally, so the handle is released before
    # any upload starts (post_file opens the file on its own).
    with open(source_pdf, "rb") as input_pdf_file:
        total_pages = len(PdfReader(input_pdf_file).pages)

    splitted_pdfs: list[ConvertedSplittedPdf] = []
    for start_page in range(0, total_pages, pages_per_file):
        # Ranges are sent 1-based and inclusive; the last one is clamped to
        # the final page of the document.
        task_id = post_file(
            source_pdf,
            start_page + 1,
            min(start_page + pages_per_file, total_pages),
        )
        splitted_pdfs.append(ConvertedSplittedPdf(task_id=task_id))

    # Poll only the tasks that are still running until none are left.
    pending = list(splitted_pdfs)
    while pending:
        for chunk in pending:
            print("checking conversion status...")
            chunk.conversion_finished = check_task_status(chunk.task_id)
        pending = [chunk for chunk in pending if not chunk.conversion_finished]

    for chunk in splitted_pdfs:
        chunk.result = get_task_result(chunk.task_id)

    # The output directory may not exist on a fresh checkout.
    out_dir.mkdir(parents=True, exist_ok=True)

    files = []
    for i, chunk in enumerate(splitted_pdfs):
        document = (chunk.result or {}).get("document") or {}
        json_payload = document.get("json_content")
        if json_payload is None:
            raise RuntimeError(
                f"Task {chunk.task_id} returned no JSON document"
            )
        # The payload arrives as an already-decoded object; serialize it again
        # so it can be validated through the model's JSON entry point.
        doc = DoclingDocument.model_validate_json(json.dumps(json_payload))
        chunk_file = f"{out_dir}/splited_json_{i}.json"
        doc.save_as_json(filename=chunk_file)
        files.append(chunk_file)

    docs = [DoclingDocument.load_from_json(filename=f) for f in files]
    concate_doc = DoclingDocument.concatenate(docs=docs)

    exp_json_file = Path(f"{out_dir}/concatenated.json")
    concate_doc.save_as_json(exp_json_file)

    print("Finished")
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()