diff --git a/docling_serve/docling_conversion.py b/docling_serve/docling_conversion.py
index 3895a9b..c8dc448 100644
--- a/docling_serve/docling_conversion.py
+++ b/docling_serve/docling_conversion.py
@@ -42,15 +42,12 @@ _log = logging.getLogger(__name__)
 # Custom serializer for PdfFormatOption
 # (model_dump_json does not work with some classes)
 def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
-    data = pdf_format_option.model_dump()
+    data = pdf_format_option.model_dump(serialize_as_any=True)
 
     # pipeline_options are not fully serialized by model_dump, dedicated pass
     if pdf_format_option.pipeline_options:
-        data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump()
-
-        # Replace `artifacts_path` with a string representation
-        data["pipeline_options"]["artifacts_path"] = repr(
-            data["pipeline_options"]["artifacts_path"]
+        data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump(
+            serialize_as_any=True, mode="json"
         )
 
     # Replace `pipeline_cls` with a string representation
@@ -59,12 +56,6 @@ def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
     # Replace `backend` with a string representation
     data["backend"] = repr(data["backend"])
 
-    # Handle `device` in `accelerator_options`
-    if "accelerator_options" in data and "device" in data["accelerator_options"]:
-        data["accelerator_options"]["device"] = repr(
-            data["accelerator_options"]["device"]
-        )
-
     # Serialize the dictionary to JSON with sorted keys to have consistent hashes
     serialized_data = json.dumps(data, sort_keys=True)
     options_hash = hashlib.sha1(serialized_data.encode()).digest()
diff --git a/tests/test_options_serialization.py b/tests/test_options_serialization.py
new file mode 100644
index 0000000..a2f3b40
--- /dev/null
+++ b/tests/test_options_serialization.py
@@ -0,0 +1,43 @@
+from docling_serve.datamodel.convert import (
+    ConvertDocumentsOptions,
+    PictureDescriptionApi,
+)
+from docling_serve.docling_conversion import (
+    _hash_pdf_format_option,
+    get_pdf_pipeline_opts,
+)
+
+
+def test_options_cache_key():
+    """Each options change below must produce a previously unseen hash."""
+    seen_hashes = set()
+
+    def assert_new_hash(options):
+        # Hash the PDF format option built from `options` and require that it
+        # differs from every hash recorded earlier in this test.
+        options_hash = _hash_pdf_format_option(get_pdf_pipeline_opts(options))
+        assert options_hash not in seen_hashes
+        seen_hashes.add(options_hash)
+
+    opts = ConvertDocumentsOptions()
+    assert_new_hash(opts)
+
+    opts.do_picture_description = True
+    assert_new_hash(opts)
+
+    opts.picture_description_api = PictureDescriptionApi(
+        url="http://localhost",
+        params={"model": "mymodel"},
+        prompt="Hello 1",
+    )
+    assert_new_hash(opts)
+
+    opts.picture_description_api = PictureDescriptionApi(
+        url="http://localhost",
+        params={"model": "your-model"},
+        prompt="Hello 1",
+    )
+    assert_new_hash(opts)
+
+    opts.picture_description_api.prompt = "World"
+    assert_new_hash(opts)