fix: avoid missing specialized keys in the options hash (#166)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-04-30 13:14:34 +02:00
committed by GitHub
parent 509f4889f8
commit 36787bc061
2 changed files with 57 additions and 12 deletions

View File

@@ -42,15 +42,12 @@ _log = logging.getLogger(__name__)
# Custom serializer for PdfFormatOption
# (model_dump_json does not work with some classes)
def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
data = pdf_format_option.model_dump()
data = pdf_format_option.model_dump(serialize_as_any=True)
# pipeline_options are not fully serialized by model_dump, dedicated pass
if pdf_format_option.pipeline_options:
data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump()
# Replace `artifacts_path` with a string representation
data["pipeline_options"]["artifacts_path"] = repr(
data["pipeline_options"]["artifacts_path"]
data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump(
serialize_as_any=True, mode="json"
)
# Replace `pipeline_cls` with a string representation
@@ -59,12 +56,6 @@ def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
# Replace `backend` with a string representation
data["backend"] = repr(data["backend"])
# Handle `device` in `accelerator_options`
if "accelerator_options" in data and "device" in data["accelerator_options"]:
data["accelerator_options"]["device"] = repr(
data["accelerator_options"]["device"]
)
# Serialize the dictionary to JSON with sorted keys to have consistent hashes
serialized_data = json.dumps(data, sort_keys=True)
options_hash = hashlib.sha1(serialized_data.encode()).digest()

View File

@@ -0,0 +1,54 @@
from docling_serve.datamodel.convert import (
ConvertDocumentsOptions,
PictureDescriptionApi,
)
from docling_serve.docling_conversion import (
_hash_pdf_format_option,
get_pdf_pipeline_opts,
)
def test_options_cache_key():
hashes = set()
opts = ConvertDocumentsOptions()
pipeline_opts = get_pdf_pipeline_opts(opts)
hash = _hash_pdf_format_option(pipeline_opts)
assert hash not in hashes
hashes.add(hash)
opts.do_picture_description = True
pipeline_opts = get_pdf_pipeline_opts(opts)
hash = _hash_pdf_format_option(pipeline_opts)
# pprint(pipeline_opts.pipeline_options.model_dump(serialize_as_any=True))
assert hash not in hashes
hashes.add(hash)
opts.picture_description_api = PictureDescriptionApi(
url="http://localhost",
params={"model": "mymodel"},
prompt="Hello 1",
)
pipeline_opts = get_pdf_pipeline_opts(opts)
hash = _hash_pdf_format_option(pipeline_opts)
# pprint(pipeline_opts.pipeline_options.model_dump(serialize_as_any=True))
assert hash not in hashes
hashes.add(hash)
opts.picture_description_api = PictureDescriptionApi(
url="http://localhost",
params={"model": "your-model"},
prompt="Hello 1",
)
pipeline_opts = get_pdf_pipeline_opts(opts)
hash = _hash_pdf_format_option(pipeline_opts)
# pprint(pipeline_opts.pipeline_options.model_dump(serialize_as_any=True))
assert hash not in hashes
hashes.add(hash)
opts.picture_description_api.prompt = "World"
pipeline_opts = get_pdf_pipeline_opts(opts)
hash = _hash_pdf_format_option(pipeline_opts)
# pprint(pipeline_opts.pipeline_options.model_dump(serialize_as_any=True))
assert hash not in hashes
hashes.add(hash)