From 87c8e4003e5115ad868564bddeeceda2bab2d7c3 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sun, 25 Jan 2026 11:32:49 +0530 Subject: [PATCH 01/15] Add score trace URL to evaluation run --- ...1_add_score_trace_url_to_evaluation_run.py | 32 +++++++++ backend/app/core/storage_utils.py | 37 ++++++++++- backend/app/crud/evaluations/core.py | 66 +++++++++++++++++-- backend/app/models/evaluation.py | 7 ++ .../app/services/evaluations/evaluation.py | 44 ++++++++++++- 5 files changed, 176 insertions(+), 10 deletions(-) create mode 100644 backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py diff --git a/backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py b/backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py new file mode 100644 index 000000000..60d5b4c26 --- /dev/null +++ b/backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py @@ -0,0 +1,32 @@ +"""Add score_trace_url to evaluation_run + +Revision ID: 041 +Revises: 040 +Create Date: 2026-01-24 19:34:46.763908 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes + + +revision = "041" +down_revision = "040" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "evaluation_run", + sa.Column( + "score_trace_url", + sqlmodel.sql.sqltypes.AutoString(), + nullable=True, + comment="S3 URL where per-trace evaluation scores are stored", + ), + ) + + +def downgrade(): + op.drop_column("evaluation_run", "score_trace_url") diff --git a/backend/app/core/storage_utils.py b/backend/app/core/storage_utils.py index 63830d7d0..5b557872a 100644 --- a/backend/app/core/storage_utils.py +++ b/backend/app/core/storage_utils.py @@ -88,6 +88,7 @@ def upload_jsonl_to_object_store( results: list[dict], filename: str, subdirectory: str, + as_json_array: bool = False, ) -> str | None: """ Upload JSONL (JSON Lines) content to object store. @@ -115,7 +116,10 @@ def upload_jsonl_to_object_store( file_path = Path(subdirectory) / filename # Convert results to JSONL - jsonl_content = "\n".join([json.dumps(result) for result in results]) + if as_json_array: + jsonl_content = json.dumps(results, ensure_ascii=False) + else: + jsonl_content = "\n".join([json.dumps(result) for result in results]) content_bytes = jsonl_content.encode("utf-8") # Create UploadFile-like object @@ -152,6 +156,37 @@ def upload_jsonl_to_object_store( return None +def load_json_from_object_store(storage: CloudStorage, url: str) -> list | None: + logger.info(f"[load_json_from_object_store] Loading JSON from '{url}") + try: + body = storage.stream(url) + content = body.read() + + data = json.loads(content.decode("utf-8")) + + logger.info( + f"[load_json_from_object_store] Download successful | " + f"url='{url}', size={len(content)} bytes" + ) + return data + except CloudStorageError as e: + logger.warning( + f"[load_json_from_object_store] failed to load JSON from '{url}': {e}", + ) + return None + except json.JSONDecodeError as e: + logger.warning( + f"[load_json_from_object_store] JSON decode error loading JSON from '{url}': {e}", + ) + return None + except Exception as e: + logger.warning( + f"[load_json_from_object_store] unexpected error loading JSON from '{url}': {e}", + exc_info=True, + ) + return None + + def generate_timestamped_filename(base_name: str, extension: str = "csv") -> str: """ Generate a filename with timestamp. diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 33b6777f3..47526aa27 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -147,6 +147,7 @@ def update_evaluation_run( status: str | None = None, error_message: str | None = None, object_store_url: str | None = None, + score_trace_url: str | None = None, score: dict | None = None, embedding_batch_job_id: int | None = None, ) -> EvaluationRun: @@ -179,6 +180,8 @@ def update_evaluation_run( eval_run.score = score if embedding_batch_job_id is not None: eval_run.embedding_batch_job_id = embedding_batch_job_id + if score_trace_url is not None: + eval_run.score_trace_url = score_trace_url # Always update timestamp eval_run.updated_at = now() @@ -296,6 +299,8 @@ def save_score( Updated EvaluationRun instance, or None if not found """ from app.core.db import engine + from app.core.cloud.storage import get_cloud_storage + from app.core.storage_utils import upload_jsonl_to_object_store with Session(engine) as session: eval_run = get_evaluation_run_by_id( @@ -304,10 +309,59 @@ def save_score( organization_id=organization_id, project_id=project_id, ) - if eval_run: - update_evaluation_run(session=session, eval_run=eval_run, score=score) - logger.info( - f"[save_score] Saved score | evaluation_id={eval_run_id} | " - f"traces={len(score.get('traces', []))}" - ) + if not eval_run: + return None + + traces = score.get("traces", []) + summary_score = score.get("summary_scores", []) + score_trace_url = None + + if traces: + try: + storage = get_cloud_storage(session=session, project_id=project_id) + score_trace_url = upload_jsonl_to_object_store( + storage=storage, + results=traces, + filename=f"traces_{eval_run_id}.json", + subdirectory=f"evaluations/score/{eval_run_id}", + as_json_array=True, + ) + if score_trace_url: + logger.info( + f"[save_score] uploaded traces to S3 | " + f"evaluation_id={eval_run_id} | url={score_trace_url} | " + f"traces_count={len(traces)}" + ) + else: + logger.warning( + f"[save_score] failed to upload traces to S3, " + f"falling back to DB storage | evaluation_id={eval_run_id}" + ) + except Exception as e: + logger.error( + f"[save_score] Error uploading traces to S3: {e} | " + f"evaluation_id={eval_run_id}", + exc_info=True, + ) + + # IF TRACES DATA IS STORED IN S3 URL THEN HERE WE ARE JUST STORING THE SUMMARY SCORE + # TODO: Evaluate weather this behaviour is needed or completely discard the storing data in db + if score_trace_url: + db_score = {"summary_scores": summary_score} + else: + # fallback to store data in db if failed to store in s3 + db_score = score + + update_evaluation_run( + session=session, + eval_run=eval_run, + score=db_score, + score_trace_url=score_trace_url, + ) + + logger.info( + f"[save_score] Saved score | evaluation_id={eval_run_id} | " + f"traces={len(score.get('traces', []))}" + ) + return eval_run diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index f99fbb27e..be489b8af 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -247,6 +247,13 @@ class EvaluationRun(SQLModel, table=True): description="Object store URL of processed evaluation results for future reference", sa_column_kwargs={"comment": "S3 URL of processed evaluation results"}, ) + score_trace_url: str | None = SQLField( + default=None, + description="S3 URL per-trace score data is stored", + sa_column_kwargs={ + "comment": "S3 URL where per-trace evaluation scores are stored" + }, + ) total_items: int = SQLField( default=0, description="Total number of items evaluated (set during processing)", diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 4c1a5de74..620f3f929 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -244,6 +244,9 @@ def get_evaluation_with_scores( Returns: Tuple of (EvaluationRun or None, error_message or None) """ + from app.core.cloud.storage import get_cloud_storage + from app.core.storage_utils import load_json_from_object_store + logger.info( f"[get_evaluation_with_scores] Fetching status for evaluation run | " f"evaluation_id={evaluation_id} | " @@ -282,9 +285,41 @@ def get_evaluation_with_scores( return eval_run, None # Check if we already have cached traces - has_cached_traces = eval_run.score is not None and "traces" in eval_run.score - if not resync_score and has_cached_traces: - return eval_run, None + has_cached_traces_s3 = eval_run.score_trace_url is not None + has_cached_traces_db = eval_run.score is not None and "traces" in eval_run.score + if not resync_score: + if has_cached_traces_s3: + try: + storage = get_cloud_storage(session=session, project_id=project_id) + traces = load_json_from_object_store( + storage=storage, url=eval_run.score_trace_url + ) + if traces is not None: + eval_run.score = { + "summary_scores": (eval_run.score or {}).get( + "summary_scores", [] + ), + "traces": traces, + } + logger.info( + f"[get_evaluation_with_scores] Loaded traces from S3 | " + f"evaluation_id={evaluation_id} | " + f"traces_count={len(traces)}" + ) + return eval_run, None + except Exception as e: + logger.error( + f"[get_evaluation_with_scores] Error loading traces from S3: {e} | " + f"evaluation_id={evaluation_id}", + exc_info=True, + ) + + elif has_cached_traces_db: + logger.info( + f"[get_evaluation_with_scores] Returning traces from DB | " + f"evaluation_id={evaluation_id}" + ) + return eval_run, None langfuse = get_langfuse_client( session=session, @@ -344,4 +379,7 @@ def get_evaluation_with_scores( score=score, ) + if eval_run: + eval_run.score = score + return eval_run, None From 87363113f8c89909d85ee27050d8791067aca502 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 29 Jan 2026 12:07:22 +0530 Subject: [PATCH 02/15] Migration: Add score_trace_url column to evaluation_run --- ...un.py => 042_add_score_trace_url_to_evaluation_run.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{041_add_score_trace_url_to_evaluation_run.py => 042_add_score_trace_url_to_evaluation_run.py} (88%) diff --git a/backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py b/backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py similarity index 88% rename from backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py rename to backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py index 60d5b4c26..46231d4b2 100644 --- a/backend/app/alembic/versions/041_add_score_trace_url_to_evaluation_run.py +++ b/backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py @@ -1,7 +1,7 @@ """Add score_trace_url to evaluation_run -Revision ID: 041 -Revises: 040 +Revision ID: 042 +Revises: 041 Create Date: 2026-01-24 19:34:46.763908 """ @@ -10,8 +10,8 @@ import sqlmodel.sql.sqltypes -revision = "041" -down_revision = "040" +revision = "042" +down_revision = "041" branch_labels = None depends_on = None From 0ac775e974c562413334e9cafd43bb37e3af74e3 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sat, 31 Jan 2026 00:01:00 +0530 Subject: [PATCH 03/15] Migration: Add score_trace_url column to evaluation_run --- ...un.py => 043_add_score_trace_url_to_evaluation_run.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{042_add_score_trace_url_to_evaluation_run.py => 043_add_score_trace_url_to_evaluation_run.py} (88%) diff --git a/backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py b/backend/app/alembic/versions/043_add_score_trace_url_to_evaluation_run.py similarity index 88% rename from backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py rename to backend/app/alembic/versions/043_add_score_trace_url_to_evaluation_run.py index 46231d4b2..19620401d 100644 --- a/backend/app/alembic/versions/042_add_score_trace_url_to_evaluation_run.py +++ b/backend/app/alembic/versions/043_add_score_trace_url_to_evaluation_run.py @@ -1,7 +1,7 @@ """Add score_trace_url to evaluation_run -Revision ID: 042 -Revises: 041 +Revision ID: 043 +Revises: 042 Create Date: 2026-01-24 19:34:46.763908 """ @@ -10,8 +10,8 @@ import sqlmodel.sql.sqltypes -revision = "042" -down_revision = "041" +revision = "043" +down_revision = "042" branch_labels = None depends_on = None From 6e3d12e082a8ca916f43e9eb2a884bc6cb76827d Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sat, 31 Jan 2026 23:40:53 +0530 Subject: [PATCH 04/15] fixes - test added - bug fixes --- backend/app/core/storage_utils.py | 9 +- backend/app/crud/evaluations/core.py | 3 +- backend/app/tests/core/test_storage_utils.py | 96 ++++++++++++++ .../crud/evaluations/test_score_storage.py | 90 ++++++++++++++ backend/app/tests/services/__init__.py | 0 .../tests/services/evaluations/__init__.py | 1 + .../evaluations/test_evaluation_service_s3.py | 117 ++++++++++++++++++ 7 files changed, 307 insertions(+), 9 deletions(-) create mode 100644 backend/app/tests/core/test_storage_utils.py create mode 100644 backend/app/tests/crud/evaluations/test_score_storage.py create mode 100644 backend/app/tests/services/__init__.py create mode 100644 backend/app/tests/services/evaluations/__init__.py create mode 100644 backend/app/tests/services/evaluations/test_evaluation_service_s3.py diff --git a/backend/app/core/storage_utils.py b/backend/app/core/storage_utils.py index 5b557872a..6a4ae43d8 100644 --- a/backend/app/core/storage_utils.py +++ b/backend/app/core/storage_utils.py @@ -87,8 +87,7 @@ def upload_jsonl_to_object_store( storage: CloudStorage, results: list[dict], filename: str, - subdirectory: str, - as_json_array: bool = False, + subdirectory: str ) -> str | None: """ Upload JSONL (JSON Lines) content to object store. @@ -115,11 +114,7 @@ def upload_jsonl_to_object_store( # Create file path file_path = Path(subdirectory) / filename - # Convert results to JSONL - if as_json_array: - jsonl_content = json.dumps(results, ensure_ascii=False) - else: - jsonl_content = "\n".join([json.dumps(result) for result in results]) + jsonl_content = json.dumps(results, ensure_ascii=False) content_bytes = jsonl_content.encode("utf-8") # Create UploadFile-like object diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 8f393e90b..afcec9261 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -363,8 +363,7 @@ def save_score( storage=storage, results=traces, filename=f"traces_{eval_run_id}.json", - subdirectory=f"evaluations/score/{eval_run_id}", - as_json_array=True, + subdirectory=f"evaluations/score/{eval_run_id}" ) if score_trace_url: logger.info( diff --git a/backend/app/tests/core/test_storage_utils.py b/backend/app/tests/core/test_storage_utils.py new file mode 100644 index 000000000..687230054 --- /dev/null +++ b/backend/app/tests/core/test_storage_utils.py @@ -0,0 +1,96 @@ +"""Tests for storage_utils.py - upload and load functions for object store.""" + +import json +from io import BytesIO +from unittest.mock import MagicMock + +import pytest + +from app.core.cloud.storage import CloudStorageError +from app.core.storage_utils import ( + load_json_from_object_store, + upload_jsonl_to_object_store, +) + + +class TestUploadJsonlToObjectStore: + """Test uploading JSON content to object store.""" + + def test_upload_success(self) -> None: + """Verify successful upload returns URL and content is correct.""" + mock_storage = MagicMock() + mock_storage.put.return_value = "s3://bucket/path/traces.json" + + results = [{"trace_id": "t1", "score": 0.9}] + + url = upload_jsonl_to_object_store( + storage=mock_storage, + results=results, + filename="traces.json", + subdirectory="evaluations/score/70", + ) + + assert url == "s3://bucket/path/traces.json" + mock_storage.put.assert_called_once() + + # Verify content is valid JSON + call_args = mock_storage.put.call_args + upload_file = call_args.kwargs.get("source") + content = upload_file.file.read().decode("utf-8") + assert json.loads(content) == results + + def test_upload_returns_none_on_error(self) -> None: + """Verify returns None on CloudStorageError.""" + mock_storage = MagicMock() + mock_storage.put.side_effect = CloudStorageError("Upload failed") + + url = upload_jsonl_to_object_store( + storage=mock_storage, + results=[{"id": 1}], + filename="test.json", + subdirectory="test", + ) + + assert url is None + + +class TestLoadJsonFromObjectStore: + """Test loading JSON from object store.""" + + def test_load_success(self) -> None: + """Verify successful load returns parsed JSON.""" + mock_storage = MagicMock() + test_data = [{"id": 1, "value": "test"}] + mock_storage.stream.return_value = BytesIO(json.dumps(test_data).encode("utf-8")) + + result = load_json_from_object_store( + storage=mock_storage, + url="s3://bucket/path/file.json", + ) + + assert result == test_data + mock_storage.stream.assert_called_once_with("s3://bucket/path/file.json") + + def test_load_returns_none_on_error(self) -> None: + """Verify returns None on CloudStorageError.""" + mock_storage = MagicMock() + mock_storage.stream.side_effect = CloudStorageError("Download failed") + + result = load_json_from_object_store( + storage=mock_storage, + url="s3://bucket/file.json", + ) + + assert result is None + + def test_load_returns_none_on_invalid_json(self) -> None: + """Verify returns None on invalid JSON content.""" + mock_storage = MagicMock() + mock_storage.stream.return_value = BytesIO(b"not valid json") + + result = load_json_from_object_store( + storage=mock_storage, + url="s3://bucket/file.json", + ) + + assert result is None diff --git a/backend/app/tests/crud/evaluations/test_score_storage.py b/backend/app/tests/crud/evaluations/test_score_storage.py new file mode 100644 index 000000000..059880076 --- /dev/null +++ b/backend/app/tests/crud/evaluations/test_score_storage.py @@ -0,0 +1,90 @@ +"""Tests for save_score() S3 upload functionality.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from app.crud.evaluations.core import save_score +from app.models import EvaluationRun + + +class TestSaveScoreS3Upload: + """Test save_score() S3 upload functionality.""" + + @pytest.fixture + def mock_eval_run(self): + """Create a mock EvaluationRun.""" + eval_run = MagicMock(spec=EvaluationRun) + eval_run.id = 100 + return eval_run + + @patch("app.crud.evaluations.core.update_evaluation_run") + @patch("app.crud.evaluations.core.get_evaluation_run_by_id") + @patch("app.core.storage_utils.upload_jsonl_to_object_store") + @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.core.db.engine") + def test_uploads_traces_to_s3_and_stores_summary_only( + self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + ) -> None: + """Verify traces uploaded to S3, only summary_scores stored in DB.""" + mock_get_eval.return_value = mock_eval_run + mock_get_storage.return_value = MagicMock() + mock_upload.return_value = "s3://bucket/traces.json" + + score = { + "summary_scores": [{"name": "accuracy", "avg": 0.9}], + "traces": [{"trace_id": "t1"}], + } + + save_score(eval_run_id=100, organization_id=1, project_id=1, score=score) + + # Verify upload was called with traces + mock_upload.assert_called_once() + assert mock_upload.call_args.kwargs["results"] == [{"trace_id": "t1"}] + + # Verify DB gets summary only, not traces + call_kwargs = mock_update.call_args.kwargs + assert call_kwargs["score"] == {"summary_scores": [{"name": "accuracy", "avg": 0.9}]} + assert call_kwargs["score_trace_url"] == "s3://bucket/traces.json" + + @patch("app.crud.evaluations.core.update_evaluation_run") + @patch("app.crud.evaluations.core.get_evaluation_run_by_id") + @patch("app.core.storage_utils.upload_jsonl_to_object_store") + @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.core.db.engine") + def test_fallback_to_db_when_s3_fails( + self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + ) -> None: + """Verify full score stored in DB when S3 upload fails.""" + mock_get_eval.return_value = mock_eval_run + mock_get_storage.return_value = MagicMock() + mock_upload.return_value = None # S3 failed + + score = { + "summary_scores": [{"name": "accuracy", "avg": 0.9}], + "traces": [{"trace_id": "t1"}], + } + + save_score(eval_run_id=100, organization_id=1, project_id=1, score=score) + + # Full score stored in DB as fallback + call_kwargs = mock_update.call_args.kwargs + assert call_kwargs["score"] == score + assert call_kwargs["score_trace_url"] is None + + @patch("app.crud.evaluations.core.update_evaluation_run") + @patch("app.crud.evaluations.core.get_evaluation_run_by_id") + @patch("app.core.storage_utils.upload_jsonl_to_object_store") + @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.core.db.engine") + def test_no_s3_upload_when_no_traces( + self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + ) -> None: + """Verify S3 upload skipped when traces is empty.""" + mock_get_eval.return_value = mock_eval_run + + score = {"summary_scores": [{"name": "accuracy", "avg": 0.9}], "traces": []} + + save_score(eval_run_id=100, organization_id=1, project_id=1, score=score) + + mock_upload.assert_not_called() diff --git a/backend/app/tests/services/__init__.py b/backend/app/tests/services/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/app/tests/services/evaluations/__init__.py b/backend/app/tests/services/evaluations/__init__.py new file mode 100644 index 000000000..293031958 --- /dev/null +++ b/backend/app/tests/services/evaluations/__init__.py @@ -0,0 +1 @@ +"""Evaluation service tests.""" diff --git a/backend/app/tests/services/evaluations/test_evaluation_service_s3.py b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py new file mode 100644 index 000000000..ea28d2aca --- /dev/null +++ b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py @@ -0,0 +1,117 @@ +"""Tests for get_evaluation_with_scores() S3 retrieval.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from app.models import EvaluationRun +from app.services.evaluations.evaluation import get_evaluation_with_scores + + +class TestGetEvaluationWithScoresS3: + """Test get_evaluation_with_scores() S3 retrieval.""" + + @pytest.fixture + def completed_eval_run_with_s3(self): + """Completed eval run with S3 URL.""" + eval_run = MagicMock(spec=EvaluationRun) + eval_run.id = 100 + eval_run.status = "completed" + eval_run.score = {"summary_scores": [{"name": "accuracy", "avg": 0.9}]} + eval_run.score_trace_url = "s3://bucket/traces.json" + eval_run.dataset_name = "test_dataset" + eval_run.run_name = "test_run" + return eval_run + + @pytest.fixture + def completed_eval_run_with_db_traces(self): + """Completed eval run with traces in DB.""" + eval_run = MagicMock(spec=EvaluationRun) + eval_run.id = 101 + eval_run.status = "completed" + eval_run.score = { + "summary_scores": [{"name": "accuracy", "avg": 0.85}], + "traces": [{"trace_id": "db_trace"}], + } + eval_run.score_trace_url = None + return eval_run + + @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") + @patch("app.core.storage_utils.load_json_from_object_store") + @patch("app.core.cloud.storage.get_cloud_storage") + def test_loads_traces_from_s3( + self, mock_get_storage, mock_load, mock_get_eval, completed_eval_run_with_s3 + ) -> None: + """Verify traces loaded from S3 and score reconstructed.""" + mock_get_eval.return_value = completed_eval_run_with_s3 + mock_get_storage.return_value = MagicMock() + mock_load.return_value = [{"trace_id": "s3_trace"}] + + result, error = get_evaluation_with_scores( + session=MagicMock(), + evaluation_id=100, + organization_id=1, + project_id=1, + get_trace_info=True, + resync_score=False, + ) + + assert error is None + mock_load.assert_called_once() + assert result.score["traces"] == [{"trace_id": "s3_trace"}] + assert result.score["summary_scores"] == [{"name": "accuracy", "avg": 0.9}] + + @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") + @patch("app.core.cloud.storage.get_cloud_storage") + def test_returns_db_traces_when_no_s3_url( + self, mock_get_storage, mock_get_eval, completed_eval_run_with_db_traces + ) -> None: + """Verify DB traces returned when no S3 URL.""" + mock_get_eval.return_value = completed_eval_run_with_db_traces + + result, error = get_evaluation_with_scores( + session=MagicMock(), + evaluation_id=101, + organization_id=1, + project_id=1, + get_trace_info=True, + resync_score=False, + ) + + assert error is None + mock_get_storage.assert_not_called() + assert result.score["traces"] == [{"trace_id": "db_trace"}] + + @patch("app.services.evaluations.evaluation.save_score") + @patch("app.services.evaluations.evaluation.fetch_trace_scores_from_langfuse") + @patch("app.services.evaluations.evaluation.get_langfuse_client") + @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") + @patch("app.core.storage_utils.load_json_from_object_store") + @patch("app.core.cloud.storage.get_cloud_storage") + def test_resync_bypasses_cache_and_fetches_langfuse( + self, + mock_get_storage, + mock_load, + mock_get_eval, + mock_get_langfuse, + mock_fetch_langfuse, + mock_save_score, + completed_eval_run_with_s3, + ) -> None: + """Verify resync=True skips S3/DB and fetches from Langfuse.""" + mock_get_eval.return_value = completed_eval_run_with_s3 + mock_get_langfuse.return_value = MagicMock() + mock_fetch_langfuse.return_value = {"summary_scores": [], "traces": [{"trace_id": "new"}]} + mock_save_score.return_value = completed_eval_run_with_s3 + + get_evaluation_with_scores( + session=MagicMock(), + evaluation_id=100, + organization_id=1, + project_id=1, + get_trace_info=True, + resync_score=True, + ) + + mock_load.assert_not_called() # S3 skipped + mock_fetch_langfuse.assert_called_once() # Langfuse called From afc0d700b62ea2b7e65a22bdc8b3e06653660751 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sat, 31 Jan 2026 23:44:52 +0530 Subject: [PATCH 05/15] Refactor: Improve formatting and readability in storage utility and test files --- backend/app/core/storage_utils.py | 5 +--- backend/app/crud/evaluations/core.py | 2 +- backend/app/tests/core/test_storage_utils.py | 4 ++- .../crud/evaluations/test_score_storage.py | 28 ++++++++++++++++--- .../evaluations/test_evaluation_service_s3.py | 5 +++- 5 files changed, 33 insertions(+), 11 deletions(-) diff --git a/backend/app/core/storage_utils.py b/backend/app/core/storage_utils.py index 6a4ae43d8..bf87033f0 100644 --- a/backend/app/core/storage_utils.py +++ b/backend/app/core/storage_utils.py @@ -84,10 +84,7 @@ def __init__(self, content: bytes): def upload_jsonl_to_object_store( - storage: CloudStorage, - results: list[dict], - filename: str, - subdirectory: str + storage: CloudStorage, results: list[dict], filename: str, subdirectory: str ) -> str | None: """ Upload JSONL (JSON Lines) content to object store. diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index afcec9261..7e2b593b2 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -363,7 +363,7 @@ def save_score( storage=storage, results=traces, filename=f"traces_{eval_run_id}.json", - subdirectory=f"evaluations/score/{eval_run_id}" + subdirectory=f"evaluations/score/{eval_run_id}", ) if score_trace_url: logger.info( diff --git a/backend/app/tests/core/test_storage_utils.py b/backend/app/tests/core/test_storage_utils.py index 687230054..1df88168c 100644 --- a/backend/app/tests/core/test_storage_utils.py +++ b/backend/app/tests/core/test_storage_utils.py @@ -61,7 +61,9 @@ def test_load_success(self) -> None: """Verify successful load returns parsed JSON.""" mock_storage = MagicMock() test_data = [{"id": 1, "value": "test"}] - mock_storage.stream.return_value = BytesIO(json.dumps(test_data).encode("utf-8")) + mock_storage.stream.return_value = BytesIO( + json.dumps(test_data).encode("utf-8") + ) result = load_json_from_object_store( storage=mock_storage, diff --git a/backend/app/tests/crud/evaluations/test_score_storage.py b/backend/app/tests/crud/evaluations/test_score_storage.py index 059880076..6c455b7ad 100644 --- a/backend/app/tests/crud/evaluations/test_score_storage.py +++ b/backend/app/tests/crud/evaluations/test_score_storage.py @@ -24,7 +24,13 @@ def mock_eval_run(self): @patch("app.core.cloud.storage.get_cloud_storage") @patch("app.core.db.engine") def test_uploads_traces_to_s3_and_stores_summary_only( - self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + self, + mock_engine, + mock_get_storage, + mock_upload, + mock_get_eval, + mock_update, + mock_eval_run, ) -> None: """Verify traces uploaded to S3, only summary_scores stored in DB.""" mock_get_eval.return_value = mock_eval_run @@ -44,7 +50,9 @@ def test_uploads_traces_to_s3_and_stores_summary_only( # Verify DB gets summary only, not traces call_kwargs = mock_update.call_args.kwargs - assert call_kwargs["score"] == {"summary_scores": [{"name": "accuracy", "avg": 0.9}]} + assert call_kwargs["score"] == { + "summary_scores": [{"name": "accuracy", "avg": 0.9}] + } assert call_kwargs["score_trace_url"] == "s3://bucket/traces.json" @patch("app.crud.evaluations.core.update_evaluation_run") @@ -53,7 +61,13 @@ def test_uploads_traces_to_s3_and_stores_summary_only( @patch("app.core.cloud.storage.get_cloud_storage") @patch("app.core.db.engine") def test_fallback_to_db_when_s3_fails( - self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + self, + mock_engine, + mock_get_storage, + mock_upload, + mock_get_eval, + mock_update, + mock_eval_run, ) -> None: """Verify full score stored in DB when S3 upload fails.""" mock_get_eval.return_value = mock_eval_run @@ -78,7 +92,13 @@ def test_fallback_to_db_when_s3_fails( @patch("app.core.cloud.storage.get_cloud_storage") @patch("app.core.db.engine") def test_no_s3_upload_when_no_traces( - self, mock_engine, mock_get_storage, mock_upload, mock_get_eval, mock_update, mock_eval_run + self, + mock_engine, + mock_get_storage, + mock_upload, + mock_get_eval, + mock_update, + mock_eval_run, ) -> None: """Verify S3 upload skipped when traces is empty.""" mock_get_eval.return_value = mock_eval_run diff --git a/backend/app/tests/services/evaluations/test_evaluation_service_s3.py b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py index ea28d2aca..28a91615c 100644 --- a/backend/app/tests/services/evaluations/test_evaluation_service_s3.py +++ b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py @@ -101,7 +101,10 @@ def test_resync_bypasses_cache_and_fetches_langfuse( """Verify resync=True skips S3/DB and fetches from Langfuse.""" mock_get_eval.return_value = completed_eval_run_with_s3 mock_get_langfuse.return_value = MagicMock() - mock_fetch_langfuse.return_value = {"summary_scores": [], "traces": [{"trace_id": "new"}]} + mock_fetch_langfuse.return_value = { + "summary_scores": [], + "traces": [{"trace_id": "new"}], + } mock_save_score.return_value = completed_eval_run_with_s3 get_evaluation_with_scores( From 788f3ae9f6185189b90eefca760ecd16ca670717 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Mon, 2 Feb 2026 10:47:24 +0530 Subject: [PATCH 06/15] bug fix --- backend/app/services/evaluations/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 3fc463086..42f1f44f8 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -259,7 +259,7 @@ def get_evaluation_with_scores( exc_info=True, ) - elif has_cached_traces_db: + if has_cached_traces_db: logger.info( f"[get_evaluation_with_scores] Returning traces from DB | " f"evaluation_id={evaluation_id}" From 5d91433feeb68def92f1b6f31ec66e8580ce99e5 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:10:49 +0530 Subject: [PATCH 07/15] - migration updated - add jsonl and json seprate upload object storage that support both openai batch response and traces from langfuse --- ...=> 044_add_score_trace_url_to_evaluation_run.py} | 4 ++-- backend/app/core/storage_utils.py | 13 ++++++++++--- backend/app/crud/evaluations/core.py | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) rename backend/app/alembic/versions/{043_add_score_trace_url_to_evaluation_run.py => 044_add_score_trace_url_to_evaluation_run.py} (93%) diff --git a/backend/app/alembic/versions/043_add_score_trace_url_to_evaluation_run.py b/backend/app/alembic/versions/044_add_score_trace_url_to_evaluation_run.py similarity index 93% rename from backend/app/alembic/versions/043_add_score_trace_url_to_evaluation_run.py rename to backend/app/alembic/versions/044_add_score_trace_url_to_evaluation_run.py index 19620401d..62a0f6324 100644 --- a/backend/app/alembic/versions/043_add_score_trace_url_to_evaluation_run.py +++ b/backend/app/alembic/versions/044_add_score_trace_url_to_evaluation_run.py @@ -10,8 +10,8 @@ import sqlmodel.sql.sqltypes -revision = "043" -down_revision = "042" +revision = "044" +down_revision = "043" branch_labels = None depends_on = None diff --git a/backend/app/core/storage_utils.py b/backend/app/core/storage_utils.py index bf87033f0..dd3889d08 100644 --- a/backend/app/core/storage_utils.py +++ b/backend/app/core/storage_utils.py @@ -15,6 +15,7 @@ from starlette.datastructures import Headers, UploadFile from app.core.cloud.storage import CloudStorage, CloudStorageError +from typing import Literal logger = logging.getLogger(__name__) @@ -84,7 +85,7 @@ def __init__(self, content: bytes): def upload_jsonl_to_object_store( - storage: CloudStorage, results: list[dict], filename: str, subdirectory: str + storage: CloudStorage, results: list[dict], filename: str, subdirectory: str, format: Literal["json","jsonl"] = "jsonl" ) -> str | None: """ Upload JSONL (JSON Lines) content to object store. @@ -110,12 +111,18 @@ def upload_jsonl_to_object_store( try: # Create file path file_path = Path(subdirectory) / filename + + if format == "jsonl": + jsonl_content = "\n".join(json.dumps(result, ensure_ascii=False) for result in results) + "\n" + content_type = {"content-type": "application/jsonl"} + else: + jsonl_content = json.dumps(results, ensure_ascii=False) + content_type = {"content-type": "application/json"} - jsonl_content = json.dumps(results, ensure_ascii=False) content_bytes = jsonl_content.encode("utf-8") # Create UploadFile-like object - headers = Headers({"content-type": "application/jsonl"}) + headers = Headers(content_type) upload_file = UploadFile( filename=filename, file=BytesIO(content_bytes), diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 7e2b593b2..43f83a5b2 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -364,6 +364,7 @@ def save_score( results=traces, filename=f"traces_{eval_run_id}.json", subdirectory=f"evaluations/score/{eval_run_id}", + format="json" ) if score_trace_url: logger.info( From 87a64ff12db6a7a45404245b4c13cf1a462b1d85 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:20:09 +0530 Subject: [PATCH 08/15] formatted using pre-commit and maintain PEP8 standards --- backend/app/core/storage_utils.py | 13 ++++++++++--- backend/app/crud/evaluations/core.py | 9 +++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/backend/app/core/storage_utils.py b/backend/app/core/storage_utils.py index dd3889d08..aa069d076 100644 --- a/backend/app/core/storage_utils.py +++ b/backend/app/core/storage_utils.py @@ -85,7 +85,11 @@ def __init__(self, content: bytes): def upload_jsonl_to_object_store( - storage: CloudStorage, results: list[dict], filename: str, subdirectory: str, format: Literal["json","jsonl"] = "jsonl" + storage: CloudStorage, + results: list[dict], + filename: str, + subdirectory: str, + format: Literal["json", "jsonl"] = "jsonl", ) -> str | None: """ Upload JSONL (JSON Lines) content to object store. @@ -111,9 +115,12 @@ def upload_jsonl_to_object_store( try: # Create file path file_path = Path(subdirectory) / filename - + if format == "jsonl": - jsonl_content = "\n".join(json.dumps(result, ensure_ascii=False) for result in results) + "\n" + jsonl_content = ( + "\n".join(json.dumps(result, ensure_ascii=False) for result in results) + + "\n" + ) content_type = {"content-type": "application/jsonl"} else: jsonl_content = json.dumps(results, ensure_ascii=False) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 43f83a5b2..c3dce44e1 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -13,6 +13,10 @@ from app.models.llm.request import ConfigBlob, LLMCallConfig from app.services.llm.jobs import resolve_config_blob +from app.core.db import engine +from app.core.cloud.storage import get_cloud_storage +from app.core.storage_utils import upload_jsonl_to_object_store + logger = logging.getLogger(__name__) @@ -338,9 +342,6 @@ def save_score( Returns: Updated EvaluationRun instance, or None if not found """ - from app.core.db import engine - from app.core.cloud.storage import get_cloud_storage - from app.core.storage_utils import upload_jsonl_to_object_store with Session(engine) as session: eval_run = get_evaluation_run_by_id( @@ -364,7 +365,7 @@ def save_score( results=traces, filename=f"traces_{eval_run_id}.json", subdirectory=f"evaluations/score/{eval_run_id}", - format="json" + format="json", ) if score_trace_url: logger.info( From d662916c248b4f484e3f428f1c1acbb2a1ebeb77 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:40:36 +0530 Subject: [PATCH 09/15] restructure storage_utils tests for JSON and JSONL formats --- backend/app/tests/core/test_storage_utils.py | 51 +++++++++++++++++--- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/backend/app/tests/core/test_storage_utils.py b/backend/app/tests/core/test_storage_utils.py index 1df88168c..d0b392c70 100644 --- a/backend/app/tests/core/test_storage_utils.py +++ b/backend/app/tests/core/test_storage_utils.py @@ -13,33 +13,63 @@ ) -class TestUploadJsonlToObjectStore: - """Test uploading JSON content to object store.""" +class TestUploadToObjectStore: + """Test uploading content to object store.""" - def test_upload_success(self) -> None: - """Verify successful upload returns URL and content is correct.""" + # ==================== Upload Success Tests ==================== + + def test_upload_json_file_success(self) -> None: + """Verify successful JSON format upload returns URL with correct content.""" mock_storage = MagicMock() mock_storage.put.return_value = "s3://bucket/path/traces.json" - results = [{"trace_id": "t1", "score": 0.9}] + results = [{"trace_id": "t1", "score": 0.9}, {"trace_id": "t2", "score": 0.8}] url = upload_jsonl_to_object_store( storage=mock_storage, results=results, filename="traces.json", subdirectory="evaluations/score/70", + format="json", ) assert url == "s3://bucket/path/traces.json" mock_storage.put.assert_called_once() - # Verify content is valid JSON + # Verify content is valid JSON array call_args = mock_storage.put.call_args upload_file = call_args.kwargs.get("source") content = upload_file.file.read().decode("utf-8") assert json.loads(content) == results - def test_upload_returns_none_on_error(self) -> None: + def test_upload_jsonl_file_success(self) -> None: + """Verify successful JSONL format upload returns URL with correct content.""" + mock_storage = MagicMock() + mock_storage.put.return_value = "s3://bucket/path/traces.jsonl" + + results = [{"trace_id": "t1", "score": 0.9}, {"trace_id": "t2", "score": 0.8}] + + url = upload_jsonl_to_object_store( + storage=mock_storage, + results=results, + filename="traces.jsonl", + subdirectory="evaluations/score/70", + format="jsonl", + ) + + assert url == "s3://bucket/path/traces.jsonl" + mock_storage.put.assert_called_once() + + # Verify content is valid JSONL (one JSON object per line) + call_args = mock_storage.put.call_args + upload_file = call_args.kwargs.get("source") + content = upload_file.file.read().decode("utf-8") + parsed_results = [json.loads(line) for line in content.strip().split("\n")] + assert parsed_results == results + + # ==================== Upload Failure Tests ==================== + + def test_upload_returns_none_on_storage_error(self) -> None: """Verify returns None on CloudStorageError.""" mock_storage = MagicMock() mock_storage.put.side_effect = CloudStorageError("Upload failed") @@ -49,6 +79,7 @@ def test_upload_returns_none_on_error(self) -> None: results=[{"id": 1}], filename="test.json", subdirectory="test", + format="json", ) assert url is None @@ -57,6 +88,8 @@ def test_upload_returns_none_on_error(self) -> None: class TestLoadJsonFromObjectStore: """Test loading JSON from object store.""" + # ==================== Load Success Tests ==================== + def test_load_success(self) -> None: """Verify successful load returns parsed JSON.""" mock_storage = MagicMock() @@ -73,7 +106,9 @@ def test_load_success(self) -> None: assert result == test_data mock_storage.stream.assert_called_once_with("s3://bucket/path/file.json") - def test_load_returns_none_on_error(self) -> None: + # ==================== Load Failure Tests ==================== + + def test_load_returns_none_on_storage_error(self) -> None: """Verify returns None on CloudStorageError.""" mock_storage = MagicMock() mock_storage.stream.side_effect = CloudStorageError("Download failed") From 941d02a03a15dba0a6a5d0f65840ad353ef4496a Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:49:14 +0530 Subject: [PATCH 10/15] mocked patch for cloud storage --- .../app/tests/crud/evaluations/test_score_storage.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/app/tests/crud/evaluations/test_score_storage.py b/backend/app/tests/crud/evaluations/test_score_storage.py index 6c455b7ad..87a82845b 100644 --- a/backend/app/tests/crud/evaluations/test_score_storage.py +++ b/backend/app/tests/crud/evaluations/test_score_storage.py @@ -20,8 +20,8 @@ def mock_eval_run(self): @patch("app.crud.evaluations.core.update_evaluation_run") @patch("app.crud.evaluations.core.get_evaluation_run_by_id") - @patch("app.core.storage_utils.upload_jsonl_to_object_store") - @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.crud.evaluations.core.upload_jsonl_to_object_store") + @patch("app.crud.evaluations.core.get_cloud_storage") @patch("app.core.db.engine") def test_uploads_traces_to_s3_and_stores_summary_only( self, @@ -57,8 +57,8 @@ def test_uploads_traces_to_s3_and_stores_summary_only( @patch("app.crud.evaluations.core.update_evaluation_run") @patch("app.crud.evaluations.core.get_evaluation_run_by_id") - @patch("app.core.storage_utils.upload_jsonl_to_object_store") - @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.crud.evaluations.core.upload_jsonl_to_object_store") + @patch("app.crud.evaluations.core.get_cloud_storage") @patch("app.core.db.engine") def test_fallback_to_db_when_s3_fails( self, @@ -88,8 +88,8 @@ def test_fallback_to_db_when_s3_fails( @patch("app.crud.evaluations.core.update_evaluation_run") @patch("app.crud.evaluations.core.get_evaluation_run_by_id") - @patch("app.core.storage_utils.upload_jsonl_to_object_store") - @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.crud.evaluations.core.upload_jsonl_to_object_store") + @patch("app.crud.evaluations.core.get_cloud_storage") @patch("app.core.db.engine") def test_no_s3_upload_when_no_traces( self, From cc0ca16f31c329c313e279cb348351e1f46a132b Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Fri, 6 Feb 2026 14:09:12 +0530 Subject: [PATCH 11/15] maintained PEP8 standards --- backend/app/services/evaluations/evaluation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 42f1f44f8..1ec364f27 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -18,6 +18,8 @@ from app.models.evaluation import EvaluationRun from app.services.llm.providers import LLMProvider from app.utils import get_langfuse_client, get_openai_client +from app.core.cloud.storage import get_cloud_storage +from app.core.storage_utils import load_json_from_object_store logger = logging.getLogger(__name__) @@ -189,8 +191,7 @@ def get_evaluation_with_scores( Returns: Tuple of (EvaluationRun or None, error_message or None) """ - from app.core.cloud.storage import get_cloud_storage - from app.core.storage_utils import load_json_from_object_store + logger.info( f"[get_evaluation_with_scores] Fetching status for evaluation run | " From 61c9082da6ec5db2212c3fe6000621ad284764b3 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Fri, 6 Feb 2026 17:20:24 +0530 Subject: [PATCH 12/15] code-rabbit proposed changes resolved --- ...4_add_score_trace_url_to_evaluation_run.py | 4 +- backend/app/core/storage_utils.py | 2 +- backend/app/crud/evaluations/core.py | 2 +- .../evaluations/test_evaluation_service_s3.py | 104 ++++++++++++------ 4 files changed, 72 insertions(+), 40 deletions(-) diff --git a/backend/app/alembic/versions/044_add_score_trace_url_to_evaluation_run.py b/backend/app/alembic/versions/044_add_score_trace_url_to_evaluation_run.py index 62a0f6324..bc1f94c62 100644 --- a/backend/app/alembic/versions/044_add_score_trace_url_to_evaluation_run.py +++ b/backend/app/alembic/versions/044_add_score_trace_url_to_evaluation_run.py @@ -1,7 +1,7 @@ """Add score_trace_url to evaluation_run -Revision ID: 043 -Revises: 042 +Revision ID: 044 +Revises: 043 Create Date: 2026-01-24 19:34:46.763908 """ diff --git a/backend/app/core/storage_utils.py b/backend/app/core/storage_utils.py index aa069d076..a3e8c720d 100644 --- a/backend/app/core/storage_utils.py +++ b/backend/app/core/storage_utils.py @@ -162,7 +162,7 @@ def upload_jsonl_to_object_store( return None -def load_json_from_object_store(storage: CloudStorage, url: str) -> list | None: +def load_json_from_object_store(storage: CloudStorage, url: str) -> list | dict | None: logger.info(f"[load_json_from_object_store] Loading JSON from '{url}") try: body = storage.stream(url) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index c3dce44e1..b247dfcae 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -386,7 +386,7 @@ def save_score( ) # IF TRACES DATA IS STORED IN S3 URL THEN HERE WE ARE JUST STORING THE SUMMARY SCORE - # TODO: Evaluate weather this behaviour is needed or completely discard the storing data in db + # TODO: Evaluate whether this behaviour is needed or completely discard the storing data in db if score_trace_url: db_score = {"summary_scores": summary_score} else: diff --git a/backend/app/tests/services/evaluations/test_evaluation_service_s3.py b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py index 28a91615c..e63fba7da 100644 --- a/backend/app/tests/services/evaluations/test_evaluation_service_s3.py +++ b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py @@ -1,5 +1,7 @@ """Tests for get_evaluation_with_scores() S3 retrieval.""" +from collections.abc import Callable +from typing import Optional from unittest.mock import MagicMock, patch import pytest @@ -12,38 +14,49 @@ class TestGetEvaluationWithScoresS3: """Test get_evaluation_with_scores() S3 retrieval.""" @pytest.fixture - def completed_eval_run_with_s3(self): - """Completed eval run with S3 URL.""" - eval_run = MagicMock(spec=EvaluationRun) - eval_run.id = 100 - eval_run.status = "completed" - eval_run.score = {"summary_scores": [{"name": "accuracy", "avg": 0.9}]} - eval_run.score_trace_url = "s3://bucket/traces.json" - eval_run.dataset_name = "test_dataset" - eval_run.run_name = "test_run" - return eval_run - - @pytest.fixture - def completed_eval_run_with_db_traces(self): - """Completed eval run with traces in DB.""" - eval_run = MagicMock(spec=EvaluationRun) - eval_run.id = 101 - eval_run.status = "completed" - eval_run.score = { - "summary_scores": [{"name": "accuracy", "avg": 0.85}], - "traces": [{"trace_id": "db_trace"}], - } - eval_run.score_trace_url = None - return eval_run + def eval_run_factory(self) -> Callable[..., MagicMock]: + """Factory that creates a MagicMock(spec=EvaluationRun) with given attrs.""" + + def _factory( + *, + id: int, + status: str, + score: dict, + score_trace_url: Optional[str] = None, + dataset_name: Optional[str] = None, + run_name: Optional[str] = None, + ) -> MagicMock: + eval_run = MagicMock(spec=EvaluationRun) + eval_run.id = id + eval_run.status = status + eval_run.score = score + eval_run.score_trace_url = score_trace_url + eval_run.dataset_name = dataset_name + eval_run.run_name = run_name + return eval_run + + return _factory @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") @patch("app.core.storage_utils.load_json_from_object_store") @patch("app.core.cloud.storage.get_cloud_storage") def test_loads_traces_from_s3( - self, mock_get_storage, mock_load, mock_get_eval, completed_eval_run_with_s3 + self, + mock_get_storage: MagicMock, + mock_load: MagicMock, + mock_get_eval: MagicMock, + eval_run_factory: Callable[..., MagicMock], ) -> None: """Verify traces loaded from S3 and score reconstructed.""" - mock_get_eval.return_value = completed_eval_run_with_s3 + eval_run = eval_run_factory( + id=100, + status="completed", + score={"summary_scores": [{"name": "accuracy", "avg": 0.9}]}, + score_trace_url="s3://bucket/traces.json", + dataset_name="test_dataset", + run_name="test_run", + ) + mock_get_eval.return_value = eval_run mock_get_storage.return_value = MagicMock() mock_load.return_value = [{"trace_id": "s3_trace"}] @@ -64,10 +77,21 @@ def test_loads_traces_from_s3( @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") @patch("app.core.cloud.storage.get_cloud_storage") def test_returns_db_traces_when_no_s3_url( - self, mock_get_storage, mock_get_eval, completed_eval_run_with_db_traces + self, + mock_get_storage: MagicMock, + mock_get_eval: MagicMock, + eval_run_factory: Callable[..., MagicMock], ) -> None: """Verify DB traces returned when no S3 URL.""" - mock_get_eval.return_value = completed_eval_run_with_db_traces + eval_run = eval_run_factory( + id=101, + status="completed", + score={ + "summary_scores": [{"name": "accuracy", "avg": 0.85}], + "traces": [{"trace_id": "db_trace"}], + }, + ) + mock_get_eval.return_value = eval_run result, error = get_evaluation_with_scores( session=MagicMock(), @@ -90,22 +114,30 @@ def test_returns_db_traces_when_no_s3_url( @patch("app.core.cloud.storage.get_cloud_storage") def test_resync_bypasses_cache_and_fetches_langfuse( self, - mock_get_storage, - mock_load, - mock_get_eval, - mock_get_langfuse, - mock_fetch_langfuse, - mock_save_score, - completed_eval_run_with_s3, + mock_get_storage: MagicMock, + mock_load: MagicMock, + mock_get_eval: MagicMock, + mock_get_langfuse: MagicMock, + mock_fetch_langfuse: MagicMock, + mock_save_score: MagicMock, + eval_run_factory: Callable[..., MagicMock], ) -> None: """Verify resync=True skips S3/DB and fetches from Langfuse.""" - mock_get_eval.return_value = completed_eval_run_with_s3 + eval_run = eval_run_factory( + id=100, + status="completed", + score={"summary_scores": [{"name": "accuracy", "avg": 0.9}]}, + score_trace_url="s3://bucket/traces.json", + dataset_name="test_dataset", + run_name="test_run", + ) + mock_get_eval.return_value = eval_run mock_get_langfuse.return_value = MagicMock() mock_fetch_langfuse.return_value = { "summary_scores": [], "traces": [{"trace_id": "new"}], } - mock_save_score.return_value = completed_eval_run_with_s3 + mock_save_score.return_value = eval_run get_evaluation_with_scores( session=MagicMock(), From dccf47048b55e25605007ed78c3ee659cdf58b1c Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Fri, 6 Feb 2026 17:21:47 +0530 Subject: [PATCH 13/15] code-formatted properly --- backend/app/services/evaluations/evaluation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 1ec364f27..8c3d01656 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -191,7 +191,6 @@ def get_evaluation_with_scores( Returns: Tuple of (EvaluationRun or None, error_message or None) """ - logger.info( f"[get_evaluation_with_scores] Fetching status for evaluation run | " From 8c0fe5297a7aafb38cd6869d649431ca5fe5d1f4 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Fri, 6 Feb 2026 17:32:04 +0530 Subject: [PATCH 14/15] fix: correct mock patch paths in S3 evaluation tests --- .../services/evaluations/test_evaluation_service_s3.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/app/tests/services/evaluations/test_evaluation_service_s3.py b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py index e63fba7da..60465b876 100644 --- a/backend/app/tests/services/evaluations/test_evaluation_service_s3.py +++ b/backend/app/tests/services/evaluations/test_evaluation_service_s3.py @@ -38,8 +38,8 @@ def _factory( return _factory @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") - @patch("app.core.storage_utils.load_json_from_object_store") - @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.services.evaluations.evaluation.load_json_from_object_store") + @patch("app.services.evaluations.evaluation.get_cloud_storage") def test_loads_traces_from_s3( self, mock_get_storage: MagicMock, @@ -75,7 +75,7 @@ def test_loads_traces_from_s3( assert result.score["summary_scores"] == [{"name": "accuracy", "avg": 0.9}] @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") - @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.services.evaluations.evaluation.get_cloud_storage") def test_returns_db_traces_when_no_s3_url( self, mock_get_storage: MagicMock, @@ -110,8 +110,8 @@ def test_returns_db_traces_when_no_s3_url( @patch("app.services.evaluations.evaluation.fetch_trace_scores_from_langfuse") @patch("app.services.evaluations.evaluation.get_langfuse_client") @patch("app.services.evaluations.evaluation.get_evaluation_run_by_id") - @patch("app.core.storage_utils.load_json_from_object_store") - @patch("app.core.cloud.storage.get_cloud_storage") + @patch("app.services.evaluations.evaluation.load_json_from_object_store") + @patch("app.services.evaluations.evaluation.get_cloud_storage") def test_resync_bypasses_cache_and_fetches_langfuse( self, mock_get_storage: MagicMock, From a43cdf05ff242439a17799d6b14e90aa38b24063 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Fri, 6 Feb 2026 17:38:51 +0530 Subject: [PATCH 15/15] added expilct type annotation for scre-trace-variable --- backend/app/crud/evaluations/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index b247dfcae..81c5e2480 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -225,7 +225,7 @@ def update_evaluation_run( if embedding_batch_job_id is not None: eval_run.embedding_batch_job_id = embedding_batch_job_id if score_trace_url is not None: - eval_run.score_trace_url = score_trace_url + eval_run.score_trace_url = score_trace_url or None # Always update timestamp eval_run.updated_at = now() @@ -355,7 +355,7 @@ def save_score( traces = score.get("traces", []) summary_score = score.get("summary_scores", []) - score_trace_url = None + score_trace_url: str | None = "" if not traces else None if traces: try: