Skip to content

Commit

Permalink
Add required_exts option to SharePoint reader (#16152)
Browse files Browse the repository at this point in the history
  • Loading branch information
jl-martins authored Sep 22, 2024
1 parent 6a8a441 commit fe37352
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from pathlib import Path
import tempfile
from typing import Any, Dict, List, Union, Optional
from typing import Any, Dict, List, Optional

import requests
from llama_index.core.readers import SimpleDirectoryReader, FileSystemReaderMixin
Expand Down Expand Up @@ -37,6 +36,7 @@ class SharePointReader(BasePydanticReader, ResourcesReaderMixin, FileSystemReade
sharepoint_folder_id (Optional[str]): The ID of the SharePoint folder to download from. Overrides sharepoint_folder_path.
drive_name (Optional[str]): The name of the drive to download from.
drive_id (Optional[str]): The ID of the drive to download from. Overrides drive_name.
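required_exts (Optional[List[str]]): File extensions to load, including the leading dot (e.g. [".md"]); downloaded files with any other extension are skipped. Passed through to `SimpleDirectoryReader`. Default is None (no extension filtering).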
file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file extension to a BaseReader class that specifies how to convert that
file to text. See `SimpleDirectoryReader` for more details.
attach_permission_metadata (bool): If True, the reader will attach permission metadata to the documents. Set to False if your vector store
Expand All @@ -50,6 +50,7 @@ class SharePointReader(BasePydanticReader, ResourcesReaderMixin, FileSystemReade
sharepoint_site_id: Optional[str] = None
sharepoint_folder_path: Optional[str] = None
sharepoint_folder_id: Optional[str] = None
required_exts: Optional[List[str]] = None
file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = Field(
default=None, exclude=True
)
Expand All @@ -70,6 +71,7 @@ def __init__(
sharepoint_site_name: Optional[str] = None,
sharepoint_folder_path: Optional[str] = None,
sharepoint_folder_id: Optional[str] = None,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,
drive_name: Optional[str] = None,
drive_id: Optional[str] = None,
Expand All @@ -82,6 +84,7 @@ def __init__(
sharepoint_site_name=sharepoint_site_name,
sharepoint_folder_path=sharepoint_folder_path,
sharepoint_folder_id=sharepoint_folder_id,
required_exts=required_exts,
file_extractor=file_extractor,
drive_name=drive_name,
drive_id=drive_id,
Expand Down Expand Up @@ -530,6 +533,7 @@ def get_metadata(filename: str) -> Any:

simple_loader = SimpleDirectoryReader(
download_dir,
required_exts=self.required_exts,
file_extractor=self.file_extractor,
file_metadata=get_metadata,
recursive=recursive,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ license = "MIT"
maintainers = ["arun-soliton"]
name = "llama-index-readers-microsoft-sharepoint"
readme = "README.md"
version = "0.3.1"
version = "0.3.2"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,47 @@ def test_load_documents_with_metadata(sharepoint_reader):
assert documents[1].metadata["file_name"] == "file2.txt"
assert documents[0].text == "File 1 content"
assert documents[1].text == "File 2 content"


def test_required_exts():
    """Check that ``required_exts`` restricts loading to matching files.

    Two files are written to a temporary directory -- a Markdown file and a
    small binary ``.aac`` file -- and a reader configured with
    ``required_exts=[".md"]`` is asked to load them. Only the Markdown file
    is expected to come back as a document.
    """
    reader = SharePointReader(
        required_exts=[".md"],
        client_id="dummy_client_id",
        client_secret="dummy_client_secret",
        tenant_id="dummy_tenant_id",
        sharepoint_site_name="dummy_site_name",
        sharepoint_folder_path="dummy_folder_path",
        drive_name="dummy_drive_name",
    )

    # One row per fixture file: (file name, file id, url, open mode, payload).
    fixtures = (
        (
            "readme.md",
            "readme_file_id",
            "http://dummyurl/readme.md",
            "w",
            "Readme content",
        ),
        (
            "audio.aac",
            "audio_file_id",
            "http://dummyurl/audio.aac",
            "wb",
            bytearray([0xFF, 0xF1, 0x50, 0x80, 0x00, 0x7F, 0xFC, 0x00]),
        ),
    )

    with tempfile.TemporaryDirectory() as workdir:
        metadata_by_path = {}
        for name, file_id, url, mode, payload in fixtures:
            local_path = os.path.join(workdir, name)
            with open(local_path, mode) as handle:
                handle.write(payload)
            # The metadata mapping is keyed by the on-disk path of each file.
            metadata_by_path[local_path] = {
                "file_id": file_id,
                "file_name": name,
                "url": url,
                "file_path": local_path,
            }

        loaded = reader._load_documents_with_metadata(
            metadata_by_path, workdir, recursive=False
        )

        assert loaded is not None
        assert len(loaded) == 1
        only_doc = loaded[0]
        assert only_doc.metadata["file_name"] == "readme.md"
        assert only_doc.text == "Readme content"

0 comments on commit fe37352

Please sign in to comment.