Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions backend/app/api/docs/documents/upload.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ Upload a document to Kaapi.
- If a target format is specified, a transformation job will also be created to transform document into target format in the background. The response will include both the uploaded document details and information about the transformation job.
- If a callback URL is provided, you will receive a notification at that URL once the document transformation job is completed.

### File Size Restrictions

- **Maximum file size**: 50MB (configurable via `MAX_DOCUMENT_UPLOAD_SIZE_MB` environment variable)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These docs are also meant for non-technical readers, who do not need to know about the environment variable part.

- Files exceeding the size limit will be rejected with a 413 (Payload Too Large) error
- Empty files will be rejected with a 422 (Unprocessable Entity) error

### Supported Transformations

The following (source_format → target_format) transformations are supported:
Expand Down
4 changes: 4 additions & 0 deletions backend/app/api/routes/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
build_document_schema,
build_document_schemas,
)
from app.services.documents.validators import validate_document_file
from app.utils import (
APIResponse,
get_openai_client,
Expand Down Expand Up @@ -123,6 +124,9 @@ async def upload_doc(
if callback_url:
validate_callback_url(callback_url)

# Validate file size before uploading to S3
await validate_document_file(src)

source_format, actual_transformer = pre_transform_validation(
src_filename=src.filename,
target_format=target_format,
Expand Down
2 changes: 1 addition & 1 deletion backend/app/api/routes/login.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def recover_password(email: str, session: SessionDep) -> Message:
return Message(message="Password recovery email sent")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this PR include changes from the other PR you made? This PR should only contain the file changes for this issue and nothing else.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I applied the changes. Can you please check?



@router.post("/reset-password/", include_in_schema=False)
@router.post("/reset-password", include_in_schema=False)
def reset_password(session: SessionDep, body: NewPassword) -> Message:
"""
Reset password
Expand Down
2 changes: 1 addition & 1 deletion backend/app/api/routes/private.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class PrivateUserCreate(BaseModel):
is_verified: bool = False


@router.post("/users/", response_model=UserPublic, include_in_schema=False)
@router.post("/users", response_model=UserPublic, include_in_schema=False)
def create_user(user_in: PrivateUserCreate, session: SessionDep) -> Any:
"""
Create a new user.
Expand Down
2 changes: 1 addition & 1 deletion backend/app/api/routes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ def test_email(email_to: EmailStr) -> Message:
return Message(message="Test email sent")


@router.get("/health/", include_in_schema=False)
@router.get("/health", include_in_schema=False)
async def health_check() -> bool:
return True
3 changes: 3 additions & 0 deletions backend/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ def AWS_S3_BUCKET(self) -> str:
CALLBACK_CONNECT_TIMEOUT: int = 3
CALLBACK_READ_TIMEOUT: int = 10

# Document upload size limit (in MB)
MAX_DOCUMENT_UPLOAD_SIZE_MB: int = 50
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


@computed_field # type: ignore[prop-decorator]
@property
def COMPUTED_CELERY_WORKER_CONCURRENCY(self) -> int:
Expand Down
54 changes: 54 additions & 0 deletions backend/app/services/documents/validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Validation utilities for document uploads."""

import logging
from pathlib import Path

from fastapi import HTTPException, UploadFile

from app.core.config import settings

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Maximum file size for document uploads (in bytes).
# Derived from settings.MAX_DOCUMENT_UPLOAD_SIZE_MB (megabytes) by converting
# MB -> bytes. Default: 50 MB, configurable via settings.
# NOTE: evaluated once, when this module is imported.
MAX_DOCUMENT_SIZE = settings.MAX_DOCUMENT_UPLOAD_SIZE_MB * 1024 * 1024


async def validate_document_file(file: UploadFile) -> int:
"""
Validate document file size.
Args:
file: The uploaded file
Returns:
File size in bytes if valid
Raises:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This "Raises" part is not needed in the function docstring.

HTTPException: If validation fails
"""
if not file.filename:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think it is possible for a file to be uploaded without a filename.

raise HTTPException(
status_code=422,
detail="File must have a filename",
)

# Get file size by seeking to end
file.file.seek(0, 2)
file_size = file.file.tell()
file.file.seek(0)

if file_size > MAX_DOCUMENT_SIZE:
raise HTTPException(
status_code=413,
detail=f"File too large. Maximum size: {MAX_DOCUMENT_SIZE / (1024 * 1024):.0f}MB",
)

if file_size == 0:
raise HTTPException(
status_code=422,
detail="Empty file uploaded"
)

logger.info(f"Document file validated: {file.filename} ({file_size} bytes)")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Prefix and mask the log message.
Logging must include the function-name prefix and mask sensitive values like filenames.

🔧 Suggested change
+from app.utils import mask_string
...
-    logger.info(f"Document file validated: {file.filename} ({file_size} bytes)")
+    logger.info(
+        f"[validate_document_file] Document file validated: "
+        f"{mask_string(file.filename)} ({file_size} bytes)"
+    )
As per coding guidelines, Prefix all log messages with the function name in square brackets: `logger.info(f"[function_name] Message {mask_string(sensitive_value)}")`.
🤖 Prompt for AI Agents
In `@backend/app/services/documents/validators.py` at line 53, The log line should
be prefixed with the function name and mask sensitive values: update the
logger.info call in the function (e.g., validate_document_file) to use the
"[validate_document_file]" prefix and call mask_string on file.filename and any
other sensitive fields (e.g., logger.info(f"[validate_document_file] Document
file validated: {mask_string(file.filename)} ({file_size} bytes)")). Ensure you
import/use the existing mask_string utility and keep file_size non-sensitive.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We put the function name in square brackets at the start of the log message so that it's traceable; check the rest of our code.

return file_size
Original file line number Diff line number Diff line change
Expand Up @@ -325,3 +325,100 @@ def test_upload_response_structure_without_transformation(
assert field in response.data

assert response.data["transformation_job"] is None

def test_upload_file_exceeds_size_limit(
self,
db: Session,
route: Route,
uploader: WebUploader,
) -> None:
"""Test that files exceeding the size limit are rejected."""
aws = AmazonCloudStorageClient()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you hitting the real AWS with this, or a mock?

aws.create()

# Create a file larger than the 50MB limit
# For testing purposes, we'll create a 51MB file
with NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as fp:
# Write 51MB of data (51 * 1024 * 1024 bytes)
chunk_size = 1024 * 1024 # 1MB chunks
for _ in range(51):
fp.write(b"0" * chunk_size)
fp.flush()
large_file = Path(fp.name)

try:
response = uploader.put(route, large_file)

assert response.status_code == 413
error_data = response.json()
assert "File too large" in error_data["error"]
assert "Maximum size: 50MB" in error_data["error"]

# Verify no document was created in the database
statement = select(Document).where(Document.fname == str(large_file))
result = db.exec(statement).first()
assert result is None
finally:
large_file.unlink()

def test_upload_empty_file(
    self,
    db: Session,
    route: Route,
    uploader: WebUploader,
) -> None:
    """Check that a zero-byte upload is refused with 422 and stores nothing."""
    storage = AmazonCloudStorageClient()
    storage.create()

    # Nothing is written, so the file on disk stays empty. delete=False keeps
    # it around after the handle closes; it is removed in the `finally` below.
    with NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
        handle.flush()
        empty_file = Path(handle.name)

    try:
        response = uploader.put(route, empty_file)
        assert response.status_code == 422

        payload = response.json()
        assert "Empty file uploaded" in payload["error"]

        # The rejected upload must not leave a Document row behind.
        query = select(Document).where(Document.fname == str(empty_file))
        leftover = db.exec(query).first()
        assert leftover is None
    finally:
        empty_file.unlink()

def test_upload_file_within_size_limit(
    self,
    db: Session,
    route: Route,
    uploader: WebUploader,
) -> None:
    """Check that a 1MB upload (well within the 50MB limit) is accepted and stored."""
    storage = AmazonCloudStorageClient()
    storage.create()

    one_megabyte = 1024 * 1024
    # delete=False keeps the file on disk after the handle closes; it is
    # removed in the `finally` below.
    with NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False) as handle:
        handle.write(b"0" * one_megabyte)
        handle.flush()
        normal_file = Path(handle.name)

    try:
        raw_response = uploader.put(route, normal_file)
        response = httpx_to_standard(raw_response)

        assert response.success is True
        assert "id" in response.data

        # Look the new document up by the id the API returned.
        query = select(Document).where(Document.id == response.data["id"])
        stored = db.exec(query).one()
        assert stored.fname == str(normal_file)
    finally:
        normal_file.unlink()

4 changes: 2 additions & 2 deletions backend/app/tests/api/routes/test_login.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_reset_password(client: TestClient, db: Session) -> None:
data = {"new_password": new_password, "token": token}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, the changes from the other PR should not appear in this diff.


r = client.post(
f"{settings.API_V1_STR}/reset-password/",
f"{settings.API_V1_STR}/reset-password",
headers=headers,
json=data,
)
Expand All @@ -89,7 +89,7 @@ def test_reset_password_invalid_token(
) -> None:
data = {"new_password": "changethis", "token": "invalid"}
r = client.post(
f"{settings.API_V1_STR}/reset-password/",
f"{settings.API_V1_STR}/reset-password",
headers=superuser_token_headers,
json=data,
)
Expand Down
2 changes: 1 addition & 1 deletion backend/app/tests/api/routes/test_private.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

def test_create_user(client: TestClient, db: Session) -> None:
r = client.post(
f"{settings.API_V1_STR}/private/users/",
f"{settings.API_V1_STR}/private/users",
json={
"email": "pollo@listo.com",
"password": "password123",
Expand Down