13 Commits
dpi ... master

Author SHA1 Message Date
Vik Paruchuri
cba67c6d15 Merge pull request #47 from datalab-to/dev
Shift import
2025-11-19 12:19:23 -05:00
Vik Paruchuri
7ac08e16e1 Merge pull request #46 from datalab-to/dev
Hide imports
2025-11-19 12:06:47 -05:00
Vik Paruchuri
7d967717a3 Merge pull request #41 from datalab-to/dev
Enable piping through params
2025-11-12 18:06:41 -05:00
Vik Paruchuri
94786516c8 Merge pull request #40 from datalab-to/dev
fix issue with pop
2025-11-12 17:34:47 -05:00
Vik Paruchuri
c735484ad4 Merge pull request #39 from datalab-to/dev
Enable passing bbox scale
2025-11-12 17:17:34 -05:00
Vik Paruchuri
914d508ddd Merge pull request #38 from datalab-to/dev
Add a small sleep
2025-11-12 16:06:21 -05:00
Vik Paruchuri
f04e0d146b Merge pull request #37 from datalab-to/dev
Dev
2025-11-12 16:03:23 -05:00
Vik Paruchuri
393d3d53f4 Merge pull request #32 from datalab-to/dev
Dev
2025-11-10 11:37:06 -05:00
Vik Paruchuri
5330679cf3 Merge pull request #22 from datalab-to/dev
Dev
2025-11-03 17:16:02 -05:00
Vik Paruchuri
b6320b09bd Merge pull request #10 from datalab-to/dev
Enable passing custom headers
2025-10-30 10:21:48 -04:00
Vik Paruchuri
b99aa32e19 Merge pull request #9 from datalab-to/dev
Improve robustness
2025-10-29 18:17:02 -04:00
Vik Paruchuri
ba6a5f71da Merge pull request #6 from datalab-to/dev
Change image rendering
2025-10-26 10:40:23 -04:00
Vik Paruchuri
57cb163663 Merge pull request #2 from datalab-to/dev
Dev
2025-10-23 16:55:57 -04:00

View File

@@ -1,4 +1,4 @@
from typing import List, Union, Optional
from typing import List
import filetype
from PIL import Image
import pypdfium2 as pdfium
@@ -27,80 +27,19 @@ def load_image(
def load_pdf_images(
filepath: str,
page_range: List[int],
image_dpi: Optional[Union[int, List[int]]] = None,
min_pdf_image_dim: Optional[Union[int, List[int]]] = None,
image_dpi: int = settings.IMAGE_DPI,
min_pdf_image_dim: int = settings.MIN_PDF_IMAGE_DIM,
) -> List[Image.Image]:
"""
Load PDF pages as images with configurable DPI.
Args:
filepath: Path to PDF file
page_range: List of page indices to render
image_dpi: Target DPI for rendering. Can be:
- None: use settings.IMAGE_DPI for all pages (default)
- int: use same DPI for all pages
- List[int]: per-page DPI (must match length of page_range)
min_pdf_image_dim: Minimum image dimension. Can be:
- None: use settings.MIN_PDF_IMAGE_DIM for all pages (default)
- int: use same value for all pages
- List[int]: per-page value (must match length of page_range)
Returns:
List of PIL Images, one per page in page_range
"""
doc = pdfium.PdfDocument(filepath)
doc.init_forms()
# Determine default values
default_image_dpi = image_dpi if isinstance(image_dpi, int) else settings.IMAGE_DPI
default_min_pdf_image_dim = min_pdf_image_dim if isinstance(min_pdf_image_dim, int) else settings.MIN_PDF_IMAGE_DIM
# Handle per-page DPI lists
is_per_page_dpi = isinstance(image_dpi, list)
if not is_per_page_dpi and image_dpi is not None:
# Convert single DPI value to list for all pages
image_dpi = [image_dpi] * len(page_range)
is_per_page_dpi = True
is_per_page_min_dim = isinstance(min_pdf_image_dim, list)
if not is_per_page_min_dim and min_pdf_image_dim is not None:
# Convert single min_dim value to list for all pages
min_pdf_image_dim = [min_pdf_image_dim] * len(page_range)
is_per_page_min_dim = True
if is_per_page_dpi and len(image_dpi) != len(page_range):
raise ValueError(f"image_dpi list length ({len(image_dpi)}) must match page_range length ({len(page_range)})")
if is_per_page_min_dim and len(min_pdf_image_dim) != len(page_range):
raise ValueError(f"min_pdf_image_dim list length ({len(min_pdf_image_dim)}) must match page_range length ({len(page_range)})")
images = []
page_idx_in_range = 0
for page in range(len(doc)):
if not page_range or page in page_range:
# Get DPI for this specific page
if is_per_page_dpi:
current_dpi = image_dpi[page_idx_in_range]
elif image_dpi is None:
current_dpi = settings.IMAGE_DPI
else:
current_dpi = default_image_dpi
# Get min_dim for this specific page
if is_per_page_min_dim:
current_min_dim = min_pdf_image_dim[page_idx_in_range]
elif min_pdf_image_dim is None:
current_min_dim = settings.MIN_PDF_IMAGE_DIM
else:
current_min_dim = default_min_pdf_image_dim
page_idx_in_range += 1
page_obj = doc[page]
min_page_dim = min(page_obj.get_width(), page_obj.get_height())
scale_dpi = (current_min_dim / min_page_dim) * 72
scale_dpi = max(scale_dpi, current_dpi)
scale_dpi = (min_pdf_image_dim / min_page_dim) * 72
scale_dpi = max(scale_dpi, image_dpi)
page_obj = doc[page]
flatten(page_obj)
page_obj = doc[page]