-
Notifications
You must be signed in to change notification settings - Fork 9
Add file size validation to document upload endpoint #576
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -83,7 +83,7 @@ def recover_password(email: str, session: SessionDep) -> Message: | |
| return Message(message="Password recovery email sent") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why does this pr have changes of the other PR you made, this PR should only have file changes for this issue and nothing else
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I applied the changes can you please check |
||
|
|
||
|
|
||
| @router.post("/reset-password/", include_in_schema=False) | ||
| @router.post("/reset-password", include_in_schema=False) | ||
| def reset_password(session: SessionDep, body: NewPassword) -> Message: | ||
| """ | ||
| Reset password | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -123,6 +123,9 @@ def AWS_S3_BUCKET(self) -> str: | |
| CALLBACK_CONNECT_TIMEOUT: int = 3 | ||
| CALLBACK_READ_TIMEOUT: int = 10 | ||
|
|
||
| # Document upload size limit (in MB) | ||
| MAX_DOCUMENT_UPLOAD_SIZE_MB: int = 50 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. since primarily we upload these files from s3 to openai, if we want to do anything with those files, and in openai single file size limit is 512 MB then we should keep it that only for now, or smth near that value - |
||
|
|
||
| @computed_field # type: ignore[prop-decorator] | ||
| @property | ||
| def COMPUTED_CELERY_WORKER_CONCURRENCY(self) -> int: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| """Validation utilities for document uploads.""" | ||
|
|
||
| import logging | ||
| from pathlib import Path | ||
|
|
||
| from fastapi import HTTPException, UploadFile | ||
|
|
||
| from app.core.config import settings | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
| # Maximum file size for document uploads (in bytes) | ||
| # Default: 50 MB, configurable via settings | ||
| MAX_DOCUMENT_SIZE = settings.MAX_DOCUMENT_UPLOAD_SIZE_MB * 1024 * 1024 | ||
|
|
||
|
|
||
| async def validate_document_file(file: UploadFile) -> int: | ||
| """ | ||
| Validate document file size. | ||
| Args: | ||
| file: The uploaded file | ||
| Returns: | ||
| File size in bytes if valid | ||
| Raises: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this "raises" part is not needed in the function doc |
||
| HTTPException: If validation fails | ||
| """ | ||
| if not file.filename: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i do not think that it is possible that a file can be uploaded which does not have a filename |
||
| raise HTTPException( | ||
| status_code=422, | ||
| detail="File must have a filename", | ||
| ) | ||
|
|
||
| # Get file size by seeking to end | ||
| file.file.seek(0, 2) | ||
| file_size = file.file.tell() | ||
| file.file.seek(0) | ||
|
|
||
| if file_size > MAX_DOCUMENT_SIZE: | ||
| raise HTTPException( | ||
| status_code=413, | ||
| detail=f"File too large. Maximum size: {MAX_DOCUMENT_SIZE / (1024 * 1024):.0f}MB", | ||
| ) | ||
|
|
||
| if file_size == 0: | ||
| raise HTTPException( | ||
| status_code=422, | ||
| detail="Empty file uploaded" | ||
| ) | ||
|
|
||
| logger.info(f"Document file validated: {file.filename} ({file_size} bytes)") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Prefix and mask the log message. 🔧 Suggested change+from app.utils import mask_string
...
- logger.info(f"Document file validated: {file.filename} ({file_size} bytes)")
+ logger.info(
+ f"[validate_document_file] Document file validated: "
+ f"{mask_string(file.filename)} ({file_size} bytes)"
+ )🤖 Prompt for AI Agents
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we put function name in a bracket at the start of the log sentence so that its trackable, check our rest of the code |
||
| return file_size | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -325,3 +325,100 @@ def test_upload_response_structure_without_transformation( | |
| assert field in response.data | ||
|
|
||
| assert response.data["transformation_job"] is None | ||
|
|
||
| def test_upload_file_exceeds_size_limit( | ||
| self, | ||
| db: Session, | ||
| route: Route, | ||
| uploader: WebUploader, | ||
| ) -> None: | ||
| """Test that files exceeding the size limit are rejected.""" | ||
| aws = AmazonCloudStorageClient() | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are you hitting the real aws with this, or mock? |
||
| aws.create() | ||
|
|
||
| # Create a file larger than the 50MB limit | ||
| # For testing purposes, we'll create a 51MB file | ||
| with NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as fp: | ||
| # Write 51MB of data (51 * 1024 * 1024 bytes) | ||
| chunk_size = 1024 * 1024 # 1MB chunks | ||
| for _ in range(51): | ||
| fp.write(b"0" * chunk_size) | ||
| fp.flush() | ||
| large_file = Path(fp.name) | ||
|
|
||
| try: | ||
| response = uploader.put(route, large_file) | ||
|
|
||
| assert response.status_code == 413 | ||
| error_data = response.json() | ||
| assert "File too large" in error_data["error"] | ||
| assert "Maximum size: 50MB" in error_data["error"] | ||
|
|
||
| # Verify no document was created in the database | ||
| statement = select(Document).where(Document.fname == str(large_file)) | ||
| result = db.exec(statement).first() | ||
| assert result is None | ||
| finally: | ||
| large_file.unlink() | ||
|
|
||
| def test_upload_empty_file( | ||
| self, | ||
| db: Session, | ||
| route: Route, | ||
| uploader: WebUploader, | ||
| ) -> None: | ||
| """Test that empty files are rejected.""" | ||
| aws = AmazonCloudStorageClient() | ||
| aws.create() | ||
|
|
||
| # Create an empty file | ||
| with NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as fp: | ||
| # Don't write anything, just create an empty file | ||
| fp.flush() | ||
| empty_file = Path(fp.name) | ||
|
|
||
| try: | ||
| response = uploader.put(route, empty_file) | ||
|
|
||
| assert response.status_code == 422 | ||
| error_data = response.json() | ||
| assert "Empty file uploaded" in error_data["error"] | ||
|
|
||
| # Verify no document was created in the database | ||
| statement = select(Document).where(Document.fname == str(empty_file)) | ||
| result = db.exec(statement).first() | ||
| assert result is None | ||
| finally: | ||
| empty_file.unlink() | ||
|
|
||
| def test_upload_file_within_size_limit( | ||
| self, | ||
| db: Session, | ||
| route: Route, | ||
| uploader: WebUploader, | ||
| ) -> None: | ||
| """Test that files within the size limit are accepted.""" | ||
| aws = AmazonCloudStorageClient() | ||
| aws.create() | ||
|
|
||
| # Create a 1MB file (well within the 50MB limit) | ||
| with NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as fp: | ||
| # Write 1MB of data | ||
| fp.write(b"0" * (1024 * 1024)) | ||
| fp.flush() | ||
| normal_file = Path(fp.name) | ||
|
|
||
| try: | ||
| response = httpx_to_standard(uploader.put(route, normal_file)) | ||
|
|
||
| assert response.success is True | ||
| assert "id" in response.data | ||
| doc_id = response.data["id"] | ||
|
|
||
| # Verify document was created in database | ||
| statement = select(Document).where(Document.id == doc_id) | ||
| result = db.exec(statement).one() | ||
| assert result.fname == str(normal_file) | ||
| finally: | ||
| normal_file.unlink() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -72,7 +72,7 @@ def test_reset_password(client: TestClient, db: Session) -> None: | |
| data = {"new_password": new_password, "token": token} | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. again, the changes from the other pr should not be highlighted here |
||
|
|
||
| r = client.post( | ||
| f"{settings.API_V1_STR}/reset-password/", | ||
| f"{settings.API_V1_STR}/reset-password", | ||
| headers=headers, | ||
| json=data, | ||
| ) | ||
|
|
@@ -89,7 +89,7 @@ def test_reset_password_invalid_token( | |
| ) -> None: | ||
| data = {"new_password": "changethis", "token": "invalid"} | ||
| r = client.post( | ||
| f"{settings.API_V1_STR}/reset-password/", | ||
| f"{settings.API_V1_STR}/reset-password", | ||
| headers=superuser_token_headers, | ||
| json=data, | ||
| ) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
these docs are for non-technical people also, they do not need to know about the environment variable part