(feat:oneDrive) file loading for ingestion

This commit is contained in:
ManishMadan2882
2026-02-19 19:45:28 +05:30
parent 6257ca7935
commit 11e59540fb
6 changed files with 754 additions and 54 deletions

View File

@@ -0,0 +1,197 @@
"""Tests for SharePoint loader."""
import pytest
from unittest.mock import patch, MagicMock
from application.parser.connectors.share_point.loader import SharePointLoader
def make_response(json_data=None, status_code=200, raise_error=None):
    """Build a ``MagicMock`` that stands in for an HTTP response object.

    Args:
        json_data: Value returned by the mock's ``json()`` method.
        status_code: Value exposed as the mock's ``status_code`` attribute.
        raise_error: If not ``None``, installed as the side effect of
            ``raise_for_status()``; otherwise ``raise_for_status()``
            returns ``None``.

    Returns:
        The configured mock. Its ``content`` attribute is always
        ``b"test content"``.
    """
    response = MagicMock()
    response.configure_mock(status_code=status_code, content=b"test content")
    response.json.return_value = json_data

    if raise_error is None:
        response.raise_for_status.return_value = None
    else:
        response.raise_for_status.side_effect = raise_error
    return response
class TestSharePointLoaderProcessFile:
    """Exercise ``SharePointLoader._process_file`` on raw item metadata."""

    def test_size_retrieved_from_root_level(self):
        """The size is read from the top level of the metadata, not the ``file`` entry."""
        # ``__new__`` creates the instance without running ``__init__``.
        sp_loader = SharePointLoader.__new__(SharePointLoader)
        item = {
            "id": "test-id",
            "name": "test.txt",
            "createdDateTime": "2024-01-01T00:00:00Z",
            "lastModifiedDateTime": "2024-01-01T00:00:00Z",
            "size": 1024,
            "file": {"mimeType": "text/plain"},
        }

        result = sp_loader._process_file(item, load_content=False)

        assert result is not None
        info = result.extra_info
        assert info["size"] == 1024
        assert info["file_name"] == "test.txt"
        assert info["mime_type"] == "text/plain"

    def test_size_null_when_missing(self):
        """A metadata dict without ``size`` yields ``None`` for the size entry."""
        sp_loader = SharePointLoader.__new__(SharePointLoader)
        # Same shape as the test above, minus the "size" key.
        item = {
            "id": "test-id",
            "name": "test.txt",
            "createdDateTime": "2024-01-01T00:00:00Z",
            "lastModifiedDateTime": "2024-01-01T00:00:00Z",
            "file": {"mimeType": "text/plain"},
        }

        result = sp_loader._process_file(item, load_content=False)

        assert result is not None
        assert result.extra_info["size"] is None
class TestSharePointLoaderLoadFileById:
    """Exercise ``SharePointLoader._load_file_by_id`` with ``requests.get`` patched."""

    # Stacked ``patch`` decorators are applied bottom-up, so the mock created by
    # the lowest decorator is the first argument after ``self``.
    @patch("application.parser.connectors.share_point.loader.requests.get")
    @patch("application.parser.connectors.share_point.loader.SharePointAuth.get_token_info_from_session")
    @patch("application.parser.connectors.share_point.loader.SharePointLoader._ensure_valid_token")
    def test_load_file_by_id_includes_size_in_select(self, mock_ensure_token, mock_get_token, mock_get):
        """The ``$select`` request parameter contains ``size``."""
        item_payload = {
            "id": "test-id",
            "name": "test.txt",
            "createdDateTime": "2024-01-01T00:00:00Z",
            "lastModifiedDateTime": "2024-01-01T00:00:00Z",
            "size": 2048,
            "file": {"mimeType": "text/plain"},
        }
        mock_get_token.return_value = {
            "access_token": "test-token",
            "refresh_token": "test-refresh",
        }
        mock_get.return_value = make_response(item_payload)

        sp_loader = SharePointLoader("test-session")
        document = sp_loader._load_file_by_id("test-id", load_content=False)

        assert document is not None
        assert document.extra_info["size"] == 2048

        # ``call_args`` unpacks to (positional args, keyword args) of the last call.
        _, request_kwargs = mock_get.call_args
        query = request_kwargs["params"]
        assert "size" in query["$select"]

    @patch("application.parser.connectors.share_point.loader.requests.get")
    @patch("application.parser.connectors.share_point.loader.SharePointAuth.get_token_info_from_session")
    @patch("application.parser.connectors.share_point.loader.SharePointLoader._ensure_valid_token")
    def test_load_file_by_id_returns_document_with_size(self, mock_ensure_token, mock_get_token, mock_get):
        """The returned document carries the id and metadata from the mocked response."""
        item_payload = {
            "id": "test-id",
            "name": "document.pdf",
            "createdDateTime": "2024-01-01T00:00:00Z",
            "lastModifiedDateTime": "2024-06-15T10:30:00Z",
            "size": 56789,
            "file": {"mimeType": "application/pdf"},
        }
        mock_get_token.return_value = {
            "access_token": "test-token",
            "refresh_token": "test-refresh",
        }
        mock_get.return_value = make_response(item_payload)

        sp_loader = SharePointLoader("test-session")
        document = sp_loader._load_file_by_id("test-id", load_content=False)

        assert document is not None
        assert document.doc_id == "test-id"

        # Checked one key at a time, in the listed order.
        expected_info = {
            "file_name": "document.pdf",
            "mime_type": "application/pdf",
            "size": 56789,
            "created_time": "2024-01-01T00:00:00Z",
            "modified_time": "2024-06-15T10:30:00Z",
            "source": "share_point",
        }
        for key, expected in expected_info.items():
            assert document.extra_info[key] == expected
class TestSharePointLoaderListItems:
    """Exercise ``SharePointLoader._list_items_in_parent`` with ``requests.get`` patched."""

    # Stacked ``patch`` decorators are applied bottom-up, so the mock created by
    # the lowest decorator is the first argument after ``self``.
    @patch("application.parser.connectors.share_point.loader.requests.get")
    @patch("application.parser.connectors.share_point.loader.SharePointAuth.get_token_info_from_session")
    @patch("application.parser.connectors.share_point.loader.SharePointLoader._ensure_valid_token")
    def test_list_items_includes_size_in_select(self, mock_ensure_token, mock_get_token, mock_get):
        """Listing items sends a ``$select`` parameter that contains ``size``."""
        file_entry = {
            "id": "file-1",
            "name": "file1.txt",
            "createdDateTime": "2024-01-01T00:00:00Z",
            "lastModifiedDateTime": "2024-01-01T00:00:00Z",
            "size": 12345,
            "file": {"mimeType": "text/plain"},
        }
        mock_get_token.return_value = {
            "access_token": "test-token",
            "refresh_token": "test-refresh",
        }
        mock_get.return_value = make_response({"value": [file_entry]})

        sp_loader = SharePointLoader("test-session")
        listed = sp_loader._list_items_in_parent("parent-id", limit=10, load_content=False)

        assert len(listed) == 1
        assert listed[0].extra_info["size"] == 12345

        # ``call_args`` unpacks to (positional args, keyword args) of the last call.
        _, request_kwargs = mock_get.call_args
        query = request_kwargs["params"]
        assert "size" in query["$select"]

    @patch("application.parser.connectors.share_point.loader.requests.get")
    @patch("application.parser.connectors.share_point.loader.SharePointAuth.get_token_info_from_session")
    @patch("application.parser.connectors.share_point.loader.SharePointLoader._ensure_valid_token")
    def test_list_items_folders_include_size(self, mock_ensure_token, mock_get_token, mock_get):
        """A folder entry is flagged as a folder and still reports its size."""
        folder_entry = {
            "id": "folder-1",
            "name": "MyFolder",
            "createdDateTime": "2024-01-01T00:00:00Z",
            "lastModifiedDateTime": "2024-01-01T00:00:00Z",
            "size": 0,
            "folder": {},
        }
        mock_get_token.return_value = {
            "access_token": "test-token",
            "refresh_token": "test-refresh",
        }
        mock_get.return_value = make_response({"value": [folder_entry]})

        sp_loader = SharePointLoader("test-session")
        listed = sp_loader._list_items_in_parent("parent-id", limit=10, load_content=False)

        assert len(listed) == 1
        folder_doc = listed[0]
        assert folder_doc.extra_info["is_folder"] is True
        assert folder_doc.extra_info["size"] == 0