diff --git a/api-reference/openapi-evals.json b/api-reference/openapi-evals.json index 14c6691..841e4a4 100644 --- a/api-reference/openapi-evals.json +++ b/api-reference/openapi-evals.json @@ -17,216 +17,24 @@ } ], "paths": { - "/azure/content_safety/evaluate": { - "post": { - "summary": "Azure Content Safety", - "description": "This evaluator detects potentially unsafe content in text, including hate speech,\nself-harm, sexual content, and violence. It allows customization of the severity\nthreshold and the specific categories to check.", - "operationId": "azure_content_safety_evaluate", - "requestBody": { - "content": { - "application/json": { - "schema": { - "allOf": [ - { - "$ref": "#/components/schemas/azure_content_safetyRequest" - }, - { - "type": "object", - "properties": { - "settings": { - "$ref": "#/components/schemas/azure_content_safetySettings" - } - } - } - ] - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Successful evaluation", - "content": { - "application/json": { - "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/EvaluationResult" - } - } - } - } - }, - "400": { - "description": "Bad request", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "detail": { - "type": "string" - } - } - } - } - } - }, - "500": { - "description": "Internal server error", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "detail": { - "type": "string" - } - } - } - } - } - } - }, - "x-codeSamples": [ - { - "lang": "python", - "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"azure/content_safety\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n },\n 
settings={}\n )\n" - }, - { - "lang": "python", - "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"azure/content_safety\",\n name=\"My Azure Content Safety Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n },\n settings={},\n )\n print(result)" - }, - { - "lang": "typescript", - "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"azure/content_safety\", {\n index,\n data: {\n input: item.input,\n output: output,\n },\n });\n },\n { concurrency: 4 }\n);" - }, - { - "lang": "typescript", - "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"azure/content_safety\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" - } - ] - } - }, - "/azure/jailbreak/evaluate": { - "post": { - "summary": "Azure Jailbreak Detection", - "description": "This evaluator checks for jailbreak-attempt in the input using Azure's Content Safety API.", - "operationId": "azure_jailbreak_evaluate", - "requestBody": { - "content": { - "application/json": { - "schema": { - "allOf": [ - { - "$ref": "#/components/schemas/azure_jailbreakRequest" - }, - { - "type": "object", - "properties": { - "settings": { - "$ref": "#/components/schemas/azure_jailbreakSettings" - } - } - } - ] - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Successful evaluation", - "content": { - "application/json": { - "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/EvaluationResult" - } - } - } - } - }, - "400": { - "description": "Bad request", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "detail": { - "type": "string" - } - } - } - } - } - }, - "500": { - "description": "Internal server error", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "detail": { - "type": "string" - } - } - } - } - } - } - }, - "x-codeSamples": [ - { - "lang": "python", - "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"azure/jailbreak\",\n index=index,\n data={\n \"input\": row[\"input\"],\n },\n settings={}\n )\n" - }, - { - "lang": "python", - "label": "Online 
Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"azure/jailbreak\",\n name=\"My Azure Jailbreak Detection Check\",\n data={\n \"input\": \"\",\n },\n settings={},\n )\n print(result)" - }, - { - "lang": "typescript", - "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"azure/jailbreak\", {\n index,\n data: {\n input: item.input,\n },\n });\n },\n { concurrency: 4 }\n);" - }, - { - "lang": "typescript", - "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"azure/jailbreak\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" - } - ] - } - }, - "/azure/prompt_injection/evaluate": { + "/openai/moderation/evaluate": { "post": { - "summary": "Azure Prompt Shield", - "description": "This evaluator checks for prompt injection attempt in the input and the contexts using Azure's Content Safety API.", - "operationId": "azure_prompt_injection_evaluate", + "summary": "OpenAI Moderation", + "description": "This evaluator uses OpenAI's moderation API to detect potentially harmful content in text,\nincluding harassment, hate speech, self-harm, sexual content, and violence.", + "operationId": "openai_moderation_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/azure_prompt_injectionRequest" + "$ref": "#/components/schemas/openai_moderationRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/azure_prompt_injectionSettings" + "$ref": "#/components/schemas/openai_moderationSettings" } } } @@ -285,22 +93,22 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"azure/prompt_injection\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"contexts\": row[\"contexts\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n 
\"openai/moderation\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"azure/prompt_injection\",\n name=\"My Azure Prompt Shield Check\",\n data={\n \"input\": \"\",\n \"contexts\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"openai/moderation\",\n name=\"My OpenAI Moderation Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"azure/prompt_injection\", {\n index,\n data: {\n input: item.input,\n contexts: item.contexts,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"openai/moderation\", {\n index,\n data: {\n input: 
item.input,\n output: output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"azure/prompt_injection\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n contexts: \"\", // your contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"openai/moderation\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } @@ -1553,24 +1361,24 @@ ] } }, - "/presidio/pii_detection/evaluate": { + "/ragas/bleu_score/evaluate": { "post": { - "summary": "Presidio PII Detection", - "description": "Detects personally identifiable information in text, including phone numbers, email addresses, and\nsocial security numbers. It allows customization of the detection threshold and the specific types of PII to check.", - "operationId": "presidio_pii_detection_evaluate", + "summary": "BLEU Score", + "description": "Traditional NLP metric. 
BLEU score for evaluating the similarity between two strings.", + "operationId": "ragas_bleu_score_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/presidio_pii_detectionRequest" + "$ref": "#/components/schemas/ragas_bleu_scoreRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/presidio_pii_detectionSettings" + "$ref": "#/components/schemas/ragas_bleu_scoreSettings" } } } @@ -1629,44 +1437,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"presidio/pii_detection\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/bleu_score\",\n index=index,\n data={\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"presidio/pii_detection\",\n name=\"My Presidio PII Detection Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/bleu_score\",\n name=\"My BLEU Score Check\",\n data={\n \"output\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"presidio/pii_detection\", {\n index,\n data: {\n input: item.input,\n output: output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/bleu_score\", {\n index,\n data: {\n output: output,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"presidio/pii_detection\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/bleu_score\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/lingua/language_detection/evaluate": { + "/ragas/context_f1/evaluate": { "post": { - "summary": "Lingua Language Detection", - "description": "This evaluator detects the language of the input and output text to check for example if the generated answer is in the same language as the prompt,\nor if it's in a specific expected language.", - "operationId": "lingua_language_detection_evaluate", + "summary": "Context F1", + "description": "Balances between precision and recall for context retrieval, increasing it means a better signal-to-noise ratio. 
Uses traditional string distance metrics.", + "operationId": "ragas_context_f1_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/lingua_language_detectionRequest" + "$ref": "#/components/schemas/ragas_context_f1Request" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/lingua_language_detectionSettings" + "$ref": "#/components/schemas/ragas_context_f1Settings" } } } @@ -1725,44 +1533,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"lingua/language_detection\",\n index=index,\n data={\n \"output\": output,\n \"input\": row[\"input\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/context_f1\",\n index=index,\n data={\n \"contexts\": row[\"contexts\"],\n \"expected_contexts\": row[\"expected_contexts\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"lingua/language_detection\",\n name=\"My Lingua Language Detection Check\",\n data={\n \"output\": \"\",\n \"input\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/context_f1\",\n name=\"My Context F1 Check\",\n data={\n \"contexts\": \"\",\n \"expected_contexts\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"lingua/language_detection\", {\n index,\n data: {\n output: output,\n input: item.input,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/context_f1\", {\n index,\n data: {\n contexts: item.contexts,\n expected_contexts: item.expected_contexts,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"lingua/language_detection\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n input: \"\", // your input value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/context_f1\", {\n name: \"my-evaluation\",\n data: {\n contexts: \"\", // your contexts value\n expected_contexts: \"\", // your expected_contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/legacy/ragas_answer_correctness/evaluate": { + "/ragas/context_precision/evaluate": { "post": { - "summary": "Ragas Answer Correctness", - "description": "Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.", - "operationId": "legacy_ragas_answer_correctness_evaluate", + "summary": "Context Precision", + "description": "Measures how accurate is the retrieval compared to expected contexts, increasing it means less noise in the retrieval. 
Uses traditional string distance metrics.", + "operationId": "ragas_context_precision_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/legacy_ragas_answer_correctnessRequest" + "$ref": "#/components/schemas/ragas_context_precisionRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/legacy_ragas_answer_correctnessSettings" + "$ref": "#/components/schemas/ragas_context_precisionSettings" } } } @@ -1821,44 +1629,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_answer_correctness\",\n index=index,\n data={\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n \"input\": row[\"input\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/context_precision\",\n index=index,\n data={\n \"contexts\": row[\"contexts\"],\n \"expected_contexts\": row[\"expected_contexts\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_answer_correctness\",\n name=\"My Ragas Answer Correctness Check\",\n data={\n \"output\": \"\",\n \"expected_output\": \"\",\n \"input\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/context_precision\",\n name=\"My Context Precision Check\",\n data={\n \"contexts\": \"\",\n \"expected_contexts\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_answer_correctness\", {\n index,\n data: {\n output: output,\n expected_output: item.expected_output,\n input: item.input,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/context_precision\", {\n index,\n data: {\n contexts: item.contexts,\n expected_contexts: item.expected_contexts,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_answer_correctness\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n input: \"\", // your input value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/context_precision\", {\n name: \"my-evaluation\",\n data: {\n contexts: \"\", // your contexts value\n expected_contexts: \"\", // your expected_contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/legacy/ragas_answer_relevancy/evaluate": { + "/ragas/context_recall/evaluate": { "post": { - "summary": "Ragas Answer Relevancy", - "description": "Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.", - "operationId": "legacy_ragas_answer_relevancy_evaluate", + "summary": "Context Recall", + "description": "Measures how many relevant contexts were retrieved compared to expected contexts, increasing it means more signal in the retrieval. 
Uses traditional string distance metrics.", + "operationId": "ragas_context_recall_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/legacy_ragas_answer_relevancyRequest" + "$ref": "#/components/schemas/ragas_context_recallRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/legacy_ragas_answer_relevancySettings" + "$ref": "#/components/schemas/ragas_context_recallSettings" } } } @@ -1917,44 +1725,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_answer_relevancy\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/context_recall\",\n index=index,\n data={\n \"contexts\": row[\"contexts\"],\n \"expected_contexts\": row[\"expected_contexts\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_answer_relevancy\",\n name=\"My Ragas Answer Relevancy Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/context_recall\",\n name=\"My Context Recall Check\",\n data={\n \"contexts\": \"\",\n \"expected_contexts\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_answer_relevancy\", {\n index,\n data: {\n input: item.input,\n output: output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/context_recall\", {\n index,\n data: {\n contexts: item.contexts,\n expected_contexts: item.expected_contexts,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_answer_relevancy\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/context_recall\", {\n name: \"my-evaluation\",\n data: {\n contexts: \"\", // your contexts value\n expected_contexts: \"\", // your expected_contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/legacy/ragas_context_precision/evaluate": { + "/ragas/factual_correctness/evaluate": { "post": { - "summary": "Ragas Context Precision", - "description": "This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. 
Higher scores indicate better precision.", - "operationId": "legacy_ragas_context_precision_evaluate", + "summary": "LLM Factual Match", + "description": "Computes with an LLM how factually similar the generated answer is to the expected output.", + "operationId": "ragas_factual_correctness_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/legacy_ragas_context_precisionRequest" + "$ref": "#/components/schemas/ragas_factual_correctnessRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/legacy_ragas_context_precisionSettings" + "$ref": "#/components/schemas/ragas_factual_correctnessSettings" } } } @@ -2013,44 +1821,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_context_precision\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"contexts\": row[\"contexts\"],\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/factual_correctness\",\n index=index,\n data={\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_context_precision\",\n name=\"My Ragas Context Precision Check\",\n data={\n \"input\": \"\",\n \"contexts\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/factual_correctness\",\n name=\"My LLM Factual Match Check\",\n data={\n \"output\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_context_precision\", {\n index,\n data: {\n input: item.input,\n contexts: item.contexts,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/factual_correctness\", {\n index,\n data: {\n output: output,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online 
Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_context_precision\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n contexts: \"\", // your contexts value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/factual_correctness\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/legacy/ragas_context_recall/evaluate": { + "/ragas/faithfulness/evaluate": { "post": { - "summary": "Ragas Context Recall", - "description": "This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance.", - "operationId": "legacy_ragas_context_recall_evaluate", + "summary": "Ragas Faithfulness", + "description": "This evaluator assesses the extent to which the generated answer is consistent with the provided context. 
Higher scores indicate better faithfulness to the context, useful for detecting hallucinations.", + "operationId": "ragas_faithfulness_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/legacy_ragas_context_recallRequest" + "$ref": "#/components/schemas/ragas_faithfulnessRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/legacy_ragas_context_recallSettings" + "$ref": "#/components/schemas/ragas_faithfulnessSettings" } } } @@ -2109,44 +1917,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_context_recall\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"contexts\": row[\"contexts\"],\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/faithfulness\",\n index=index,\n data={\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n \"input\": row[\"input\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_context_recall\",\n name=\"My Ragas Context Recall Check\",\n data={\n \"input\": \"\",\n \"contexts\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/faithfulness\",\n name=\"My Ragas Faithfulness Check\",\n data={\n \"output\": \"\",\n \"contexts\": \"\",\n \"input\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_context_recall\", {\n index,\n data: {\n input: item.input,\n contexts: item.contexts,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/faithfulness\", {\n index,\n data: {\n output: output,\n contexts: item.contexts,\n input: item.input,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_context_recall\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n contexts: \"\", // your contexts value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/faithfulness\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n input: \"\", // your input value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/legacy/ragas_context_relevancy/evaluate": { + "/ragas/response_context_precision/evaluate": { "post": { - "summary": "Ragas Context Relevancy", - "description": "This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. 
The values fall within the range of (0, 1), with higher values indicating better relevancy.", - "operationId": "legacy_ragas_context_relevancy_evaluate", + "summary": "Ragas Response Context Precision", + "description": "Uses an LLM to measure the proportion of chunks in the retrieved context that were relevant to generate the output or the expected output.", + "operationId": "ragas_response_context_precision_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/legacy_ragas_context_relevancyRequest" + "$ref": "#/components/schemas/ragas_response_context_precisionRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/legacy_ragas_context_relevancySettings" + "$ref": "#/components/schemas/ragas_response_context_precisionSettings" } } } @@ -2205,44 +2013,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_context_relevancy\",\n index=index,\n data={\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/response_context_precision\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"contexts\": row[\"contexts\"],\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_context_relevancy\",\n name=\"My Ragas Context Relevancy Check\",\n data={\n \"output\": \"\",\n \"contexts\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/response_context_precision\",\n name=\"My Ragas Response Context Precision Check\",\n data={\n \"input\": \"\",\n \"contexts\": \"\",\n \"output\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_context_relevancy\", {\n index,\n data: {\n output: output,\n contexts: item.contexts,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/response_context_precision\", {\n index,\n data: {\n input: item.input,\n contexts: item.contexts,\n output: output,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { 
"lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_context_relevancy\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/response_context_precision\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n contexts: \"\", // your contexts value\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/legacy/ragas_context_utilization/evaluate": { + "/ragas/response_context_recall/evaluate": { "post": { - "summary": "Ragas Context Utilization", - "description": "This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. 
Higher scores indicate better utilization.", - "operationId": "legacy_ragas_context_utilization_evaluate", + "summary": "Ragas Response Context Recall", + "description": "Uses an LLM to measure how many of the relevant documents attributable to the claims in the output were successfully retrieved in order to generate an expected output.", + "operationId": "ragas_response_context_recall_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/legacy_ragas_context_utilizationRequest" + "$ref": "#/components/schemas/ragas_response_context_recallRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/legacy_ragas_context_utilizationSettings" + "$ref": "#/components/schemas/ragas_response_context_recallSettings" } } } @@ -2301,44 +2109,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_context_utilization\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/response_context_recall\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_context_utilization\",\n name=\"My Ragas Context Utilization Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n \"contexts\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/response_context_recall\",\n name=\"My Ragas Response Context Recall Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n \"contexts\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_context_utilization\", {\n index,\n data: {\n input: item.input,\n output: output,\n contexts: item.contexts,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/response_context_recall\", {\n index,\n data: {\n input: item.input,\n output: output,\n contexts: item.contexts,\n expected_output: item.expected_output,\n },\n });\n 
},\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_context_utilization\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/response_context_recall\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/legacy/ragas_faithfulness/evaluate": { + "/ragas/response_relevancy/evaluate": { "post": { - "summary": "Ragas Faithfulness", - "description": "This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations.", - "operationId": "legacy_ragas_faithfulness_evaluate", + "summary": "Ragas Response Relevancy", + "description": "Evaluates how pertinent the generated answer is to the given prompt. 
Higher scores indicate better relevancy.", + "operationId": "ragas_response_relevancy_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/legacy_ragas_faithfulnessRequest" + "$ref": "#/components/schemas/ragas_response_relevancyRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/legacy_ragas_faithfulnessSettings" + "$ref": "#/components/schemas/ragas_response_relevancySettings" } } } @@ -2397,44 +2205,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_faithfulness\",\n index=index,\n data={\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/response_relevancy\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_faithfulness\",\n name=\"My Ragas Faithfulness Check\",\n data={\n \"output\": \"\",\n \"contexts\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/response_relevancy\",\n name=\"My Ragas Response Relevancy Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_faithfulness\", {\n index,\n data: {\n output: output,\n contexts: item.contexts,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/response_relevancy\", {\n index,\n data: {\n input: item.input,\n output: output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_faithfulness\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/response_relevancy\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/openai/moderation/evaluate": { + "/ragas/rouge_score/evaluate": { "post": { - "summary": "OpenAI Moderation", - "description": "This evaluator uses OpenAI's moderation API to detect potentially harmful content in text,\nincluding harassment, hate speech, self-harm, sexual content, and violence.", - "operationId": "openai_moderation_evaluate", + "summary": "ROUGE Score", + "description": "Traditional NLP metric. 
ROUGE score for evaluating the similarity between two strings.", + "operationId": "ragas_rouge_score_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/openai_moderationRequest" + "$ref": "#/components/schemas/ragas_rouge_scoreRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/openai_moderationSettings" + "$ref": "#/components/schemas/ragas_rouge_scoreSettings" } } } @@ -2493,44 +2301,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"openai/moderation\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/rouge_score\",\n index=index,\n data={\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"openai/moderation\",\n name=\"My OpenAI Moderation Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/rouge_score\",\n name=\"My ROUGE Score Check\",\n data={\n \"output\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"openai/moderation\", {\n index,\n data: {\n input: item.input,\n output: output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/rouge_score\", {\n index,\n data: {\n output: output,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"openai/moderation\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/rouge_score\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/bleu_score/evaluate": { + "/ragas/rubrics_based_scoring/evaluate": { "post": { - "summary": "BLEU Score", - "description": "Traditional NLP metric. BLEU score for evaluating the similarity between two strings.", - "operationId": "ragas_bleu_score_evaluate", + "summary": "Rubrics Based Scoring", + "description": "Rubric-based evaluation metric that is used to evaluate responses. 
The rubric consists of descriptions for each score, typically ranging from 1 to 5", + "operationId": "ragas_rubrics_based_scoring_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_bleu_scoreRequest" + "$ref": "#/components/schemas/ragas_rubrics_based_scoringRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_bleu_scoreSettings" + "$ref": "#/components/schemas/ragas_rubrics_based_scoringSettings" } } } @@ -2589,44 +2397,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/bleu_score\",\n index=index,\n data={\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/rubrics_based_scoring\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/bleu_score\",\n name=\"My BLEU Score Check\",\n data={\n \"output\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/rubrics_based_scoring\",\n name=\"My Rubrics Based Scoring Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/bleu_score\", {\n index,\n data: {\n output: output,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/rubrics_based_scoring\", {\n index,\n data: {\n input: item.input,\n output: output,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/bleu_score\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/rubrics_based_scoring\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/context_f1/evaluate": { + "/ragas/sql_query_equivalence/evaluate": { "post": { - "summary": "Context F1", - "description": "Balances between precision and recall for context retrieval, increasing it means a better signal-to-noise ratio. 
Uses traditional string distance metrics.", - "operationId": "ragas_context_f1_evaluate", + "summary": "SQL Query Equivalence", + "description": "Checks if the SQL query is equivalent to a reference one by using an LLM to infer if it would generate the same results given the table schemas.", + "operationId": "ragas_sql_query_equivalence_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_context_f1Request" + "$ref": "#/components/schemas/ragas_sql_query_equivalenceRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_context_f1Settings" + "$ref": "#/components/schemas/ragas_sql_query_equivalenceSettings" } } } @@ -2685,44 +2493,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/context_f1\",\n index=index,\n data={\n \"contexts\": row[\"contexts\"],\n \"expected_contexts\": row[\"expected_contexts\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/sql_query_equivalence\",\n index=index,\n data={\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n \"expected_contexts\": row[\"expected_contexts\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/context_f1\",\n name=\"My Context F1 Check\",\n data={\n \"contexts\": \"\",\n \"expected_contexts\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/sql_query_equivalence\",\n name=\"My SQL Query Equivalence Check\",\n data={\n \"output\": \"\",\n \"expected_output\": \"\",\n \"expected_contexts\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/context_f1\", {\n index,\n data: {\n contexts: item.contexts,\n expected_contexts: item.expected_contexts,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/sql_query_equivalence\", {\n index,\n data: {\n output: output,\n expected_output: item.expected_output,\n expected_contexts: item.expected_contexts,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online 
Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/context_f1\", {\n name: \"my-evaluation\",\n data: {\n contexts: \"\", // your contexts value\n expected_contexts: \"\", // your expected_contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/sql_query_equivalence\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n expected_contexts: \"\", // your expected_contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/context_precision/evaluate": { + "/ragas/summarization_score/evaluate": { "post": { - "summary": "Context Precision", - "description": "Measures how accurate is the retrieval compared to expected contexts, increasing it means less noise in the retrieval. 
Uses traditional string distance metrics.", - "operationId": "ragas_context_precision_evaluate", + "summary": "Summarization Score", + "description": "Measures how well the summary captures important information from the retrieved contexts.", + "operationId": "ragas_summarization_score_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_context_precisionRequest" + "$ref": "#/components/schemas/ragas_summarization_scoreRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_context_precisionSettings" + "$ref": "#/components/schemas/ragas_summarization_scoreSettings" } } } @@ -2781,44 +2589,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/context_precision\",\n index=index,\n data={\n \"contexts\": row[\"contexts\"],\n \"expected_contexts\": row[\"expected_contexts\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/summarization_score\",\n index=index,\n data={\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/context_precision\",\n name=\"My Context Precision Check\",\n data={\n \"contexts\": \"\",\n \"expected_contexts\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/summarization_score\",\n name=\"My Summarization Score Check\",\n data={\n \"output\": \"\",\n \"contexts\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/context_precision\", {\n index,\n data: {\n contexts: item.contexts,\n expected_contexts: item.expected_contexts,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/summarization_score\", {\n index,\n data: {\n output: output,\n contexts: item.contexts,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst 
langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/context_precision\", {\n name: \"my-evaluation\",\n data: {\n contexts: \"\", // your contexts value\n expected_contexts: \"\", // your expected_contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/summarization_score\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/context_recall/evaluate": { + "/lingua/language_detection/evaluate": { "post": { - "summary": "Context Recall", - "description": "Measures how many relevant contexts were retrieved compared to expected contexts, increasing it means more signal in the retrieval. 
Uses traditional string distance metrics.", - "operationId": "ragas_context_recall_evaluate", + "summary": "Lingua Language Detection", + "description": "This evaluator detects the language of the input and output text to check for example if the generated answer is in the same language as the prompt,\nor if it's in a specific expected language.", + "operationId": "lingua_language_detection_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_context_recallRequest" + "$ref": "#/components/schemas/lingua_language_detectionRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_context_recallSettings" + "$ref": "#/components/schemas/lingua_language_detectionSettings" } } } @@ -2877,44 +2685,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/context_recall\",\n index=index,\n data={\n \"contexts\": row[\"contexts\"],\n \"expected_contexts\": row[\"expected_contexts\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"lingua/language_detection\",\n index=index,\n data={\n \"output\": output,\n \"input\": row[\"input\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/context_recall\",\n name=\"My Context Recall Check\",\n data={\n \"contexts\": \"\",\n \"expected_contexts\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"lingua/language_detection\",\n name=\"My Lingua Language Detection Check\",\n data={\n \"output\": \"\",\n \"input\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/context_recall\", {\n index,\n data: {\n contexts: item.contexts,\n expected_contexts: item.expected_contexts,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"lingua/language_detection\", {\n index,\n data: {\n output: output,\n input: item.input,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new 
LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/context_recall\", {\n name: \"my-evaluation\",\n data: {\n contexts: \"\", // your contexts value\n expected_contexts: \"\", // your expected_contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"lingua/language_detection\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n input: \"\", // your input value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/factual_correctness/evaluate": { + "/legacy/ragas_answer_correctness/evaluate": { "post": { - "summary": "LLM Factual Match", - "description": "Computes with an LLM how factually similar the generated answer is to the expected output.", - "operationId": "ragas_factual_correctness_evaluate", + "summary": "Ragas Answer Correctness", + "description": "Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.", + "operationId": "legacy_ragas_answer_correctness_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_factual_correctnessRequest" + "$ref": "#/components/schemas/legacy_ragas_answer_correctnessRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_factual_correctnessSettings" + "$ref": "#/components/schemas/legacy_ragas_answer_correctnessSettings" } } } @@ -2973,44 +2781,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = 
langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/factual_correctness\",\n index=index,\n data={\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_answer_correctness\",\n index=index,\n data={\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n \"input\": row[\"input\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/factual_correctness\",\n name=\"My LLM Factual Match Check\",\n data={\n \"output\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_answer_correctness\",\n name=\"My Ragas Answer Correctness Check\",\n data={\n \"output\": \"\",\n \"expected_output\": \"\",\n \"input\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/factual_correctness\", {\n index,\n data: {\n output: output,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_answer_correctness\", {\n index,\n data: {\n output: output,\n expected_output: item.expected_output,\n input: item.input,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/factual_correctness\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_answer_correctness\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n input: \"\", // your input value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/faithfulness/evaluate": { + "/legacy/ragas_answer_relevancy/evaluate": { "post": { - "summary": "Ragas Faithfulness", - "description": "This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations.", - "operationId": "ragas_faithfulness_evaluate", + "summary": "Ragas Answer Relevancy", + "description": "Evaluates how pertinent the generated answer is to the given prompt. 
Higher scores indicate better relevancy.", + "operationId": "legacy_ragas_answer_relevancy_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_faithfulnessRequest" + "$ref": "#/components/schemas/legacy_ragas_answer_relevancyRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_faithfulnessSettings" + "$ref": "#/components/schemas/legacy_ragas_answer_relevancySettings" } } } @@ -3069,44 +2877,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/faithfulness\",\n index=index,\n data={\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n \"input\": row[\"input\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_answer_relevancy\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/faithfulness\",\n name=\"My Ragas Faithfulness Check\",\n data={\n \"output\": \"\",\n \"contexts\": \"\",\n \"input\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_answer_relevancy\",\n name=\"My Ragas Answer Relevancy Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/faithfulness\", {\n index,\n data: {\n output: output,\n contexts: item.contexts,\n input: item.input,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_answer_relevancy\", {\n index,\n data: {\n input: item.input,\n output: output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/faithfulness\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n input: \"\", // your input value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_answer_relevancy\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/response_context_precision/evaluate": { + "/legacy/ragas_context_precision/evaluate": { "post": { - "summary": "Ragas Response Context Precision", - "description": "Uses an LLM to measure the proportion of chunks in the retrieved context that were relevant to generate the output or the expected output.", - "operationId": "ragas_response_context_precision_evaluate", + "summary": "Ragas Context Precision", + "description": "This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. 
Higher scores indicate better precision.", + "operationId": "legacy_ragas_context_precision_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_response_context_precisionRequest" + "$ref": "#/components/schemas/legacy_ragas_context_precisionRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_response_context_precisionSettings" + "$ref": "#/components/schemas/legacy_ragas_context_precisionSettings" } } } @@ -3165,44 +2973,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/response_context_precision\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"contexts\": row[\"contexts\"],\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_context_precision\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"contexts\": row[\"contexts\"],\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/response_context_precision\",\n name=\"My Ragas Response Context Precision Check\",\n data={\n \"input\": \"\",\n \"contexts\": \"\",\n \"output\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_context_precision\",\n name=\"My Ragas Context Precision Check\",\n data={\n \"input\": \"\",\n \"contexts\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/response_context_precision\", {\n index,\n data: {\n input: item.input,\n contexts: item.contexts,\n output: output,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_context_precision\", {\n index,\n data: {\n input: item.input,\n contexts: item.contexts,\n expected_output: 
item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/response_context_precision\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n contexts: \"\", // your contexts value\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_context_precision\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n contexts: \"\", // your contexts value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/response_context_recall/evaluate": { + "/legacy/ragas_context_recall/evaluate": { "post": { - "summary": "Ragas Response Context Recall", - "description": "Uses an LLM to measure how many of relevant documents attributable the claims in the output were successfully retrieved in order to generate an expected output.", - "operationId": "ragas_response_context_recall_evaluate", + "summary": "Ragas Context Recall", + "description": "This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. 
Higher values indicate better performance.", + "operationId": "legacy_ragas_context_recall_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_response_context_recallRequest" + "$ref": "#/components/schemas/legacy_ragas_context_recallRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_response_context_recallSettings" + "$ref": "#/components/schemas/legacy_ragas_context_recallSettings" } } } @@ -3261,44 +3069,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/response_context_recall\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_context_recall\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"contexts\": row[\"contexts\"],\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/response_context_recall\",\n name=\"My Ragas Response Context Recall Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n \"contexts\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_context_recall\",\n name=\"My Ragas Context Recall Check\",\n data={\n \"input\": \"\",\n \"contexts\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/response_context_recall\", {\n index,\n data: {\n input: item.input,\n output: output,\n contexts: item.contexts,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_context_recall\", {\n index,\n data: {\n input: item.input,\n contexts: item.contexts,\n expected_output: 
item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/response_context_recall\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_context_recall\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n contexts: \"\", // your contexts value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/response_relevancy/evaluate": { + "/legacy/ragas_context_relevancy/evaluate": { "post": { - "summary": "Ragas Response Relevancy", - "description": "Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.", - "operationId": "ragas_response_relevancy_evaluate", + "summary": "Ragas Context Relevancy", + "description": "This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. 
The values fall within the range of (0, 1), with higher values indicating better relevancy.", + "operationId": "legacy_ragas_context_relevancy_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_response_relevancyRequest" + "$ref": "#/components/schemas/legacy_ragas_context_relevancyRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_response_relevancySettings" + "$ref": "#/components/schemas/legacy_ragas_context_relevancySettings" } } } @@ -3357,44 +3165,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/response_relevancy\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_context_relevancy\",\n index=index,\n data={\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/response_relevancy\",\n name=\"My Ragas Response Relevancy Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_context_relevancy\",\n name=\"My Ragas Context Relevancy Check\",\n data={\n \"output\": \"\",\n \"contexts\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/response_relevancy\", {\n index,\n data: {\n input: item.input,\n output: output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_context_relevancy\", {\n index,\n data: {\n output: output,\n contexts: item.contexts,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/response_relevancy\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_context_relevancy\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/rouge_score/evaluate": { + "/legacy/ragas_context_utilization/evaluate": { "post": { - "summary": "ROUGE Score", - "description": "Traditional NLP metric. ROUGE score for evaluating the similarity between two strings.", - "operationId": "ragas_rouge_score_evaluate", + "summary": "Ragas Context Utilization", + "description": "This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. 
Higher scores indicate better utilization.", + "operationId": "legacy_ragas_context_utilization_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_rouge_scoreRequest" + "$ref": "#/components/schemas/legacy_ragas_context_utilizationRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_rouge_scoreSettings" + "$ref": "#/components/schemas/legacy_ragas_context_utilizationSettings" } } } @@ -3453,44 +3261,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/rouge_score\",\n index=index,\n data={\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_context_utilization\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/rouge_score\",\n name=\"My ROUGE Score Check\",\n data={\n \"output\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_context_utilization\",\n name=\"My Ragas Context Utilization Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n \"contexts\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/rouge_score\", {\n index,\n data: {\n output: output,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_context_utilization\", {\n index,\n data: {\n input: item.input,\n output: output,\n contexts: item.contexts,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/rouge_score\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_context_utilization\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/rubrics_based_scoring/evaluate": { + "/legacy/ragas_faithfulness/evaluate": { "post": { - "summary": "Rubrics Based Scoring", - "description": "Rubric-based evaluation metric that is used to evaluate responses. The rubric consists of descriptions for each score, typically ranging from 1 to 5", - "operationId": "ragas_rubrics_based_scoring_evaluate", + "summary": "Ragas Faithfulness", + "description": "This evaluator assesses the extent to which the generated answer is consistent with the provided context. 
Higher scores indicate better faithfulness to the context, useful for detecting hallucinations.", + "operationId": "legacy_ragas_faithfulness_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_rubrics_based_scoringRequest" + "$ref": "#/components/schemas/legacy_ragas_faithfulnessRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_rubrics_based_scoringSettings" + "$ref": "#/components/schemas/legacy_ragas_faithfulnessSettings" } } } @@ -3549,44 +3357,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/rubrics_based_scoring\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"legacy/ragas_faithfulness\",\n index=index,\n data={\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/rubrics_based_scoring\",\n name=\"My Rubrics Based Scoring Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n \"expected_output\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"legacy/ragas_faithfulness\",\n name=\"My Ragas Faithfulness Check\",\n data={\n \"output\": \"\",\n \"contexts\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/rubrics_based_scoring\", {\n index,\n data: {\n input: item.input,\n output: output,\n expected_output: item.expected_output,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"legacy/ragas_faithfulness\", {\n index,\n data: {\n output: output,\n contexts: item.contexts,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/rubrics_based_scoring\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"legacy/ragas_faithfulness\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/sql_query_equivalence/evaluate": { + "/azure/content_safety/evaluate": { "post": { - "summary": "SQL Query Equivalence", - "description": "Checks if the SQL query is equivalent to a reference one by using an LLM to infer if it would generate the same results given the table schemas.", - "operationId": "ragas_sql_query_equivalence_evaluate", + "summary": "Azure Content Safety", + "description": "This evaluator detects potentially unsafe content in text, including hate speech,\nself-harm, sexual content, and violence. 
It allows customization of the severity\nthreshold and the specific categories to check.", + "operationId": "azure_content_safety_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_sql_query_equivalenceRequest" + "$ref": "#/components/schemas/azure_content_safetyRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_sql_query_equivalenceSettings" + "$ref": "#/components/schemas/azure_content_safetySettings" } } } @@ -3645,44 +3453,44 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/sql_query_equivalence\",\n index=index,\n data={\n \"output\": output,\n \"expected_output\": row[\"expected_output\"],\n \"expected_contexts\": row[\"expected_contexts\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"azure/content_safety\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/sql_query_equivalence\",\n name=\"My SQL Query Equivalence Check\",\n data={\n \"output\": \"\",\n \"expected_output\": \"\",\n \"expected_contexts\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"azure/content_safety\",\n name=\"My Azure Content Safety Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/sql_query_equivalence\", {\n index,\n data: {\n output: output,\n expected_output: item.expected_output,\n expected_contexts: item.expected_contexts,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"azure/content_safety\", {\n index,\n data: {\n input: item.input,\n output: output,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/sql_query_equivalence\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n expected_output: \"\", // your expected_output value\n expected_contexts: \"\", // your expected_contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"azure/content_safety\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } }, - "/ragas/summarization_score/evaluate": { + "/azure/jailbreak/evaluate": { "post": { - "summary": "Summarization Score", - "description": "Measures how well the summary captures important information from the retrieved contexts.", - "operationId": "ragas_summarization_score_evaluate", + "summary": "Azure Jailbreak Detection", + "description": "This evaluator checks for jailbreak-attempt in the input using Azure's Content Safety API.", + "operationId": "azure_jailbreak_evaluate", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { - "$ref": "#/components/schemas/ragas_summarization_scoreRequest" + "$ref": "#/components/schemas/azure_jailbreakRequest" }, { "type": "object", "properties": { "settings": { - "$ref": "#/components/schemas/ragas_summarization_scoreSettings" + "$ref": "#/components/schemas/azure_jailbreakSettings" } } } @@ -3741,160 +3549,339 @@ { "lang": "python", "label": "Experiment", - "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in 
experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"ragas/summarization_score\",\n index=index,\n data={\n \"output\": output,\n \"contexts\": row[\"contexts\"],\n },\n settings={}\n )\n" + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"azure/jailbreak\",\n index=index,\n data={\n \"input\": row[\"input\"],\n },\n settings={}\n )\n" }, { "lang": "python", "label": "Online Evaluation", - "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"ragas/summarization_score\",\n name=\"My Summarization Score Check\",\n data={\n \"output\": \"\",\n \"contexts\": \"\",\n },\n settings={},\n )\n print(result)" + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... 
# your existing code\n result = langwatch.evaluation.evaluate(\n \"azure/jailbreak\",\n name=\"My Azure Jailbreak Detection Check\",\n data={\n \"input\": \"\",\n },\n settings={},\n )\n print(result)" }, { "lang": "typescript", "label": "Experiment", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"ragas/summarization_score\", {\n index,\n data: {\n output: output,\n contexts: item.contexts,\n },\n });\n },\n { concurrency: 4 }\n);" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"azure/jailbreak\", {\n index,\n data: {\n input: item.input,\n },\n });\n },\n { concurrency: 4 }\n);" }, { "lang": "typescript", "label": "Online Evaluation", - "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"ragas/summarization_score\", {\n name: \"my-evaluation\",\n data: {\n output: \"\", // your output value\n contexts: \"\", // your contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"azure/jailbreak\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } ] } - } - }, - "components": { - "schemas": { - "EvaluationResult": { - "type": "object", - "properties": { - "status": { - "type": "string", - "enum": [ - "processed", - "skipped", - "error" - ] - }, - "score": { - "type": "number", - "description": "Numeric score from the evaluation" - }, - "passed": { - "type": "boolean", - "description": "Whether the evaluation passed" - }, - "label": { - "type": "string", - "description": "Label assigned by the evaluation" - }, - "details": { - "type": "string", - "description": "Additional details about the evaluation" - }, - "cost": { - "type": "object", - "properties": { - "currency": { - "type": "string" - }, - "amount": { - "type": "number" + }, + "/azure/prompt_injection/evaluate": { + "post": { + "summary": "Azure Prompt Shield", + "description": "This evaluator checks for prompt injection attempt in the input and the contexts using Azure's Content Safety API.", + "operationId": "azure_prompt_injection_evaluate", + "requestBody": { + "content": { + "application/json": { + "schema": { + "allOf": [ + { + "$ref": "#/components/schemas/azure_prompt_injectionRequest" + }, + { + "type": "object", + "properties": { + "settings": { + "$ref": "#/components/schemas/azure_prompt_injectionSettings" + } 
+ } + } + ] } } - } - } - }, - "azure_content_safetyRequest": { - "type": "object", - "properties": { - "input": { - "type": "string", - "description": "The input text to evaluate" }, - "output": { - "type": "string", - "description": "The output/response text to evaluate" - } + "required": true }, - "required": [] - }, - "azure_content_safetySettings": { - "type": "object", - "properties": { - "severity_threshold": { - "description": "The minimum severity level to consider content as unsafe, from 1 to 7.", - "type": "number", - "default": 1 + "responses": { + "200": { + "description": "Successful evaluation", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EvaluationResult" + } + } + } + } }, - "categories": { - "description": "The categories of moderation to check for.", - "type": "object", - "default": { - "Hate": true, - "SelfHarm": true, - "Sexual": true, - "Violence": true + "400": { + "description": "Bad request", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "detail": { + "type": "string" + } + } + } + } } }, - "output_type": { - "description": "The type of severity levels to return on the full 0-7 severity scale, it can be either the trimmed version with four values (0, 2, 4, 6 scores) or the whole range.", - "type": "string", - "default": "FourSeverityLevels" + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "detail": { + "type": "string" + } + } + } + } + } } - } - }, - "azure_jailbreakRequest": { - "type": "object", - "properties": { - "input": { - "type": "string", - "description": "The input text to evaluate" + }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Experiment", + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = 
langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code here\n experiment.evaluate(\n \"azure/prompt_injection\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"contexts\": row[\"contexts\"],\n },\n settings={}\n )\n" + }, + { + "lang": "python", + "label": "Online Evaluation", + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"azure/prompt_injection\",\n name=\"My Azure Prompt Shield Check\",\n data={\n \"input\": \"\",\n \"contexts\": \"\",\n },\n settings={},\n )\n print(result)" + }, + { + "lang": "typescript", + "label": "Experiment", + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"azure/prompt_injection\", {\n index,\n data: {\n input: item.input,\n contexts: item.contexts,\n },\n });\n },\n { concurrency: 4 }\n);" + }, + { + "lang": "typescript", + "label": "Online Evaluation", + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"azure/prompt_injection\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n contexts: \"\", // your contexts value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" } + ] + } + }, + "/presidio/pii_detection/evaluate": { + "post": { + "summary": "Presidio PII Detection", + "description": "Detects personally identifiable information in text, including phone numbers, email addresses, and\nsocial security numbers. It allows customization of the detection threshold and the specific types of PII to check.", + "operationId": "presidio_pii_detection_evaluate", + "requestBody": { + "content": { + "application/json": { + "schema": { + "allOf": [ + { + "$ref": "#/components/schemas/presidio_pii_detectionRequest" + }, + { + "type": "object", + "properties": { + "settings": { + "$ref": "#/components/schemas/presidio_pii_detectionSettings" + } + } + } + ] + } + } + }, + "required": true }, - "required": [ - "input" + "responses": { + "200": { + "description": "Successful evaluation", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EvaluationResult" + } + } + } + } + }, + "400": { + "description": "Bad request", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "detail": { + "type": "string" + } + } + } + } + } + }, + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "detail": { + "type": "string" + } + } + } + } + } + } + }, + "x-codeSamples": [ + { + "lang": "python", + "label": "Experiment", + "source": "import langwatch\n\ndf = langwatch.datasets.get_dataset(\"dataset-id\").to_pandas()\n\nexperiment = langwatch.experiment.init(\"my-experiment\")\n\nfor index, row in experiment.loop(df.iterrows()):\n # your execution code 
here\n experiment.evaluate(\n \"presidio/pii_detection\",\n index=index,\n data={\n \"input\": row[\"input\"],\n \"output\": output,\n },\n settings={}\n )\n" + }, + { + "lang": "python", + "label": "Online Evaluation", + "source": "import langwatch\n\n@langwatch.span()\ndef my_llm_step():\n ... # your existing code\n result = langwatch.evaluation.evaluate(\n \"presidio/pii_detection\",\n name=\"My Presidio PII Detection Check\",\n data={\n \"input\": \"\",\n \"output\": \"\",\n },\n settings={},\n )\n print(result)" + }, + { + "lang": "typescript", + "label": "Experiment", + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\n// Fetch dataset from LangWatch\nconst dataset = await langwatch.datasets.get(\"your-dataset-slug\");\n\nconst experiment = await langwatch.experiments.init(\"my-experiment\");\n\nawait experiment.run(\n dataset.entries.map((e) => e.entry),\n async ({ item, index }) => {\n // Run your LLM/agent\n const output = await myLLM(item.input);\n\n // Evaluate the output\n await experiment.evaluate(\"presidio/pii_detection\", {\n index,\n data: {\n input: item.input,\n output: output,\n },\n });\n },\n { concurrency: 4 }\n);" + }, + { + "lang": "typescript", + "label": "Online Evaluation", + "source": "import { LangWatch } from \"langwatch\";\n\nconst langwatch = new LangWatch();\n\nasync function myLLMStep(input: string): Promise {\n // ... 
your existing code\n\n // Call the evaluator\n const result = await langwatch.evaluations.evaluate(\"presidio/pii_detection\", {\n name: \"my-evaluation\",\n data: {\n input: \"\", // your input value\n output: \"\", // your output value\n },\n settings: {},\n });\n\n console.log(result);\n return result;\n}" + } ] - }, - "azure_jailbreakSettings": { + } + } + }, + "components": { + "schemas": { + "EvaluationResult": { "type": "object", - "properties": {} + "properties": { + "status": { + "type": "string", + "enum": [ + "processed", + "skipped", + "error" + ] + }, + "score": { + "type": "number", + "description": "Numeric score from the evaluation" + }, + "passed": { + "type": "boolean", + "description": "Whether the evaluation passed" + }, + "label": { + "type": "string", + "description": "Label assigned by the evaluation" + }, + "details": { + "type": "string", + "description": "Additional details about the evaluation" + }, + "cost": { + "type": "object", + "properties": { + "currency": { + "type": "string" + }, + "amount": { + "type": "number" + } + } + } + } }, - "azure_prompt_injectionRequest": { + "openai_moderationRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + } }, - "description": "Array of context strings used for RAG evaluation" + "required": [] } }, "required": [ - "input" + "data" ] }, - "azure_prompt_injectionSettings": { + "openai_moderationSettings": { "type": "object", - "properties": {} + "properties": { + "model": { + "description": "The model version to use, `text-moderation-latest` will 
be automatically upgraded over time, while `text-moderation-stable` will only be updated with advanced notice by OpenAI.", + "type": "string", + "default": "text-moderation-stable" + }, + "categories": { + "description": "The categories of content to check for moderation.", + "type": "object", + "default": { + "harassment": true, + "harassment_threatening": true, + "hate": true, + "hate_threatening": true, + "self_harm": true, + "self_harm_instructions": true, + "self_harm_intent": true, + "sexual": true, + "sexual_minors": true, + "violence": true, + "violence_graphic": true + } + } + } }, "langevals_basicRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "output": { - "type": "string", - "description": "The output/response text to evaluate" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + } + }, + "required": [] } }, - "required": [] + "required": [ + "data" + ] }, "langevals_basicSettings": { "type": "object", @@ -3915,16 +3902,28 @@ "langevals_competitor_blocklistRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "input": { - "type": "string", - "description": "The input text to evaluate" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "input": { + "type": "string", + "description": "The input text to evaluate" + } + }, + "required": [] } }, - "required": [] + "required": [ + "data" + ] }, "langevals_competitor_blocklistSettings": { "type": 
"object", @@ -3943,16 +3942,28 @@ "langevals_competitor_llmRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "input": { - "type": "string", - "description": "The input text to evaluate" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "input": { + "type": "string", + "description": "The input text to evaluate" + } + }, + "required": [] } }, - "required": [] + "required": [ + "data" + ] }, "langevals_competitor_llmSettings": { "type": "object", @@ -3982,16 +3993,28 @@ "langevals_competitor_llm_function_callRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "input": { - "type": "string", - "description": "The input text to evaluate" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "input": { + "type": "string", + "description": "The input text to evaluate" + } + }, + "required": [] } }, - "required": [] + "required": [ + "data" + ] }, "langevals_competitor_llm_function_callSettings": { "type": "object", @@ -4030,18 +4053,30 @@ "langevals_exact_matchRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "expected_output": { - "type": "string", - "description": "The expected output for comparison" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + 
"expected_output": { + "type": "string", + "description": "The expected output for comparison" + } + }, + "required": [ + "output", + "expected_output" + ] } }, "required": [ - "output", - "expected_output" + "data" ] }, "langevals_exact_matchSettings": { @@ -4067,22 +4102,34 @@ "langevals_llm_answer_matchRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "expected_output": { - "type": "string", - "description": "The expected output for comparison" - }, - "input": { - "type": "string", - "description": "The input text to evaluate" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + }, + "input": { + "type": "string", + "description": "The input text to evaluate" + } + }, + "required": [ + "output", + "expected_output" + ] } }, "required": [ - "output", - "expected_output" + "data" ] }, "langevals_llm_answer_matchSettings": { @@ -4108,23 +4155,35 @@ "langevals_llm_booleanRequest": { "type": "object", "properties": { - "input": { - "type": "string", - "description": "The input text to evaluate" - }, - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG 
evaluation" + } }, - "description": "Array of context strings used for RAG evaluation" + "required": [] } }, - "required": [] + "required": [ + "data" + ] }, "langevals_llm_booleanSettings": { "type": "object", @@ -4149,23 +4208,35 @@ "langevals_llm_categoryRequest": { "type": "object", "properties": { - "input": { - "type": "string", - "description": "The input text to evaluate" - }, - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + } }, - "description": "Array of context strings used for RAG evaluation" + "required": [] } }, - "required": [] + "required": [ + "data" + ] }, "langevals_llm_categorySettings": { "type": "object", @@ -4204,23 +4275,35 @@ "langevals_llm_scoreRequest": { "type": "object", "properties": { - "input": { - "type": "string", - "description": "The input text to evaluate" - }, - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG 
evaluation" + } }, - "description": "Array of context strings used for RAG evaluation" + "required": [] } }, - "required": [] + "required": [ + "data" + ] }, "langevals_llm_scoreSettings": { "type": "object", @@ -4245,13 +4328,25 @@ "langevals_off_topicRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" + }, + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + } + }, + "required": [ + "input" + ] } }, "required": [ - "input" + "data" ] }, "langevals_off_topicSettings": { @@ -4286,24 +4381,36 @@ "langevals_query_resolutionRequest": { "type": "object", "properties": { - "conversation": { - "type": "array", - "items": { - "type": "object", - "properties": { - "role": { - "type": "string" - }, - "content": { - "type": "string" - } + "trace_id": { + "type": "string", + "description": "Optional trace ID to associate this evaluation with a trace" + }, + "data": { + "type": "object", + "properties": { + "conversation": { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string" + }, + "content": { + "type": "string" + } + } + }, + "description": "Array of conversation turns for multi-turn evaluation" } }, - "description": "Array of conversation turns for multi-turn evaluation" + "required": [ + "conversation" + ] } }, "required": [ - "conversation" + "data" ] }, "langevals_query_resolutionSettings": { @@ -4324,16 +4431,28 @@ "langevals_similarityRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "output": { - "type": "string", - "description": "The output/response text to evaluate" + "data": { + "type": "object", + 
"properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + } + }, + "required": [] } }, - "required": [] + "required": [ + "data" + ] }, "langevals_similaritySettings": { "type": "object", @@ -4363,12 +4482,24 @@ "langevals_valid_formatRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" + }, + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + } + }, + "required": [] } }, - "required": [] + "required": [ + "data" + ] }, "langevals_valid_formatSettings": { "type": "object", @@ -4383,123 +4514,201 @@ } } }, - "presidio_pii_detectionRequest": { + "ragas_bleu_scoreRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "output": { - "type": "string", - "description": "The output/response text to evaluate" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + } + }, + "required": [ + "output", + "expected_output" + ] } }, - "required": [] + "required": [ + "data" + ] }, - "presidio_pii_detectionSettings": { + "ragas_bleu_scoreSettings": { + "type": "object", + "properties": {} + }, + "ragas_context_f1Request": { "type": "object", "properties": { - "entities": { - "description": "The types of PII to check for in the input.", - "type": "object", - "default": { - "credit_card": true, - "crypto": true, - "email_address": true, - 
"iban_code": true, - "ip_address": true, - "location": false, - "person": false, - "phone_number": true, - "medical_license": true, - "us_bank_number": false, - "us_driver_license": false, - "us_itin": false, - "us_passport": false, - "us_ssn": false, - "uk_nhs": false, - "sg_nric_fin": false, - "au_abn": false, - "au_acn": false, - "au_tfn": false, - "au_medicare": false, - "in_pan": false, - "in_aadhaar": false, - "in_vehicle_registration": false, - "in_voter": false, - "in_passport": false - } + "trace_id": { + "type": "string", + "description": "Optional trace ID to associate this evaluation with a trace" }, - "min_threshold": { - "description": "The minimum confidence required for failing the evaluation on a PII match.", - "type": "number", - "default": 0.5 + "data": { + "type": "object", + "properties": { + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + }, + "expected_contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The expected contexts for comparison" + } + }, + "required": [ + "contexts", + "expected_contexts" + ] + } + }, + "required": [ + "data" + ] + }, + "ragas_context_f1Settings": { + "type": "object", + "properties": { + "distance_measure": { + "type": "string", + "default": "levenshtein" } } }, - "lingua_language_detectionRequest": { + "ragas_context_precisionRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "input": { - "type": "string", - "description": "The input text to evaluate" + "data": { + "type": "object", + "properties": { + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + }, + "expected_contexts": { + "type": "array", + "items": { + "type": 
"string" + }, + "description": "The expected contexts for comparison" + } + }, + "required": [ + "contexts", + "expected_contexts" + ] } }, "required": [ - "output" + "data" ] }, - "lingua_language_detectionSettings": { + "ragas_context_precisionSettings": { "type": "object", "properties": { - "check_for": { - "description": "What should be checked", + "distance_measure": { "type": "string", - "default": "input_matches_output" - }, - "expected_language": { - "description": "The specific language that the output is expected to be", - "type": "string" - }, - "min_words": { - "description": "Minimum number of words to check, as the language detection can be unreliable for very short texts. Inputs shorter than the minimum will be skipped.", - "type": "number", - "default": 7 - }, - "threshold": { - "description": "Minimum confidence threshold for the language detection. If the confidence is lower than this, the evaluation will be skipped.", - "type": "number", - "default": 0.25 + "default": "levenshtein" } } }, - "legacy_ragas_answer_correctnessRequest": { + "ragas_context_recallRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "expected_output": { + "data": { + "type": "object", + "properties": { + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + }, + "expected_contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The expected contexts for comparison" + } + }, + "required": [ + "contexts", + "expected_contexts" + ] + } + }, + "required": [ + "data" + ] + }, + "ragas_context_recallSettings": { + "type": "object", + "properties": { + "distance_measure": { "type": "string", - "description": "The expected output for comparison" - }, - "input": { + "default": 
"levenshtein" + } + } + }, + "ragas_factual_correctnessRequest": { + "type": "object", + "properties": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" + }, + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + } + }, + "required": [ + "output", + "expected_output" + ] } }, "required": [ - "output", - "expected_output" + "data" ] }, - "legacy_ragas_answer_correctnessSettings": { + "ragas_factual_correctnessSettings": { "type": "object", "properties": { "model": { @@ -4507,36 +4716,65 @@ "type": "string", "default": "openai/gpt-5" }, - "embeddings_model": { - "description": "The model to use for embeddings.", - "type": "string", - "default": "openai/text-embedding-ada-002" - }, "max_tokens": { "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", "type": "number", "default": 2048 + }, + "mode": { + "description": "The mode to use for the factual correctness metric.", + "type": "string", + "default": "f1" + }, + "atomicity": { + "description": "The level of atomicity for claim decomposition.", + "type": "string", + "default": "low" + }, + "coverage": { + "description": "The level of coverage for claim decomposition.", + "type": "string", + "default": "low" } } }, - "legacy_ragas_answer_relevancyRequest": { + "ragas_faithfulnessRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "output": { - "type": "string", - "description": "The output/response text to evaluate" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + }, + "input": { + "type": "string", + "description": "The input text to evaluate" + } + }, + "required": [ + "output", + "contexts" + ] } }, "required": [ - "input", - "output" + "data" ] }, - "legacy_ragas_answer_relevancySettings": { + "ragas_faithfulnessSettings": { "type": "object", "properties": { "model": { @@ -4544,44 +4782,59 @@ "type": "string", "default": "openai/gpt-5" }, - "embeddings_model": { - "description": "The model to use for embeddings.", - "type": "string", - "default": "openai/text-embedding-ada-002" - }, "max_tokens": { "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", "type": "number", "default": 2048 + }, + "autodetect_dont_know": { + "description": "Whether to autodetect 'I don't know' in the output to avoid failing the evaluation.", + "type": "boolean", + "default": true } } }, - "legacy_ragas_context_precisionRequest": { + "ragas_response_context_precisionRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + } }, - "description": "Array of context strings used for RAG evaluation" - }, - "expected_output": { - "type": "string", - "description": "The expected output for comparison" + "required": [ + "input", + "contexts" + ] } }, "required": [ - "input", - "contexts", - "expected_output" + "data" ] }, - "legacy_ragas_context_precisionSettings": { + "ragas_response_context_precisionSettings": { "type": "object", "properties": { "model": { @@ -4589,11 +4842,6 @@ "type": "string", "default": "openai/gpt-5" }, - "embeddings_model": { - "description": "The model to use for embeddings.", - "type": "string", - "default": "openai/text-embedding-ada-002" - }, "max_tokens": { "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", "type": "number", @@ -4601,32 +4849,49 @@ } } }, - "legacy_ragas_context_recallRequest": { + "ragas_response_context_recallRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + } }, - "description": "Array of context strings used for RAG evaluation" - }, - "expected_output": { - "type": "string", - "description": "The expected output for comparison" + "required": [ + "input", + "output", + "contexts", + "expected_output" + ] } }, "required": [ - "input", - "contexts", - "expected_output" + "data" ] }, - "legacy_ragas_context_recallSettings": { + "ragas_response_context_recallSettings": { "type": "object", "properties": { "model": { @@ -4634,11 +4899,6 @@ "type": "string", "default": "openai/gpt-5" }, - "embeddings_model": { - "description": "The model to use for embeddings.", - "type": "string", - "default": "openai/text-embedding-ada-002" - }, "max_tokens": { "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", "type": "number", @@ -4646,27 +4906,36 @@ } } }, - "legacy_ragas_context_relevancyRequest": { + "ragas_response_relevancyRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + } }, - "description": "Array of context strings used for RAG evaluation" + "required": [ + "input", + "output" + ] } }, "required": [ - "output", - "contexts" + "data" ] }, - "legacy_ragas_context_relevancySettings": { + "ragas_response_relevancySettings": { "type": "object", "properties": { "model": { @@ -4674,84 +4943,96 @@ "type": "string", "default": "openai/gpt-5" }, - "embeddings_model": { - "description": "The model to use for embeddings.", - "type": "string", - "default": "openai/text-embedding-ada-002" - }, "max_tokens": { "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", "type": "number", "default": 2048 + }, + "embeddings_model": { + "description": "The model to use for embeddings.", + "type": "string", + "default": "openai/text-embedding-ada-002" } } }, - "legacy_ragas_context_utilizationRequest": { + "ragas_rouge_scoreRequest": { "type": "object", "properties": { - "input": { - "type": "string", - "description": "The input text to evaluate" - }, - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + } }, - "description": "Array of context strings used for RAG evaluation" + "required": [ + "output", + "expected_output" + ] } }, "required": [ - "input", - "output", - "contexts" + "data" ] }, - "legacy_ragas_context_utilizationSettings": { + "ragas_rouge_scoreSettings": { "type": "object", "properties": { - "model": { - "description": "The model to use for evaluation.", + "rouge_type": { + "description": "ROUGE type", "type": "string", - "default": "openai/gpt-5" + "default": "rouge1" }, - "embeddings_model": { - "description": "The model to use for embeddings.", + "measure_type": { + "description": "ROUGE measure type", "type": "string", - "default": "openai/text-embedding-ada-002" - }, - "max_tokens": { - "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", - "type": "number", - "default": 2048 + "default": "fmeasure" } } }, - "legacy_ragas_faithfulnessRequest": { + "ragas_rubrics_based_scoringRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + } }, - "description": "Array of context strings used for RAG evaluation" + "required": [ + "input", + "output" + ] } }, "required": [ - "output", - "contexts" + "data" ] }, - "legacy_ragas_faithfulnessSettings": { + "ragas_rubrics_based_scoringSettings": { "type": "object", "properties": { "model": { @@ -4759,194 +5040,267 @@ "type": "string", "default": "openai/gpt-5" }, - "embeddings_model": { - "description": "The model to use for embeddings.", - "type": "string", - "default": "openai/text-embedding-ada-002" - }, "max_tokens": { "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", "type": "number", "default": 2048 + }, + "rubrics": { + "type": "array", + "default": [ + { + "description": "The response is incorrect, irrelevant." + }, + { + "description": "The response partially answers the question but includes significant errors, omissions, or irrelevant information." + }, + { + "description": "The response partially answers the question but includes minor errors, omissions, or irrelevant information." 
+ }, + { + "description": "The response fully answers the question and includes minor errors, omissions, or irrelevant information." + }, + { + "description": "The response fully answers the question and includes no errors, omissions, or irrelevant information." + } + ] } } }, - "openai_moderationRequest": { + "ragas_sql_query_equivalenceRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "output": { - "type": "string", - "description": "The output/response text to evaluate" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + }, + "expected_contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The expected contexts for comparison" + } + }, + "required": [ + "output", + "expected_output", + "expected_contexts" + ] } }, - "required": [] + "required": [ + "data" + ] }, - "openai_moderationSettings": { + "ragas_sql_query_equivalenceSettings": { "type": "object", "properties": { "model": { - "description": "The model version to use, `text-moderation-latest` will be automatically upgraded over time, while `text-moderation-stable` will only be updated with advanced notice by OpenAI.", + "description": "The model to use for evaluation.", "type": "string", - "default": "text-moderation-stable" + "default": "openai/gpt-5" }, - "categories": { - "description": "The categories of content to check for moderation.", - "type": "object", - "default": { - "harassment": true, - "harassment_threatening": true, - "hate": true, - "hate_threatening": true, - "self_harm": true, - "self_harm_instructions": true, - "self_harm_intent": true, - "sexual": true, - "sexual_minors": true, - "violence": 
true, - "violence_graphic": true - } + "max_tokens": { + "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + "type": "number", + "default": 2048 } } }, - "ragas_bleu_scoreRequest": { + "ragas_summarization_scoreRequest": { "type": "object", "properties": { - "output": { - "type": "string", - "description": "The output/response text to evaluate" - }, - "expected_output": { + "trace_id": { "type": "string", - "description": "The expected output for comparison" - } - }, - "required": [ - "output", - "expected_output" - ] - }, - "ragas_bleu_scoreSettings": { - "type": "object", - "properties": {} - }, - "ragas_context_f1Request": { - "type": "object", - "properties": { - "contexts": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Array of context strings used for RAG evaluation" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "expected_contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + } }, - "description": "The expected contexts for comparison" + "required": [ + "output", + "contexts" + ] } }, "required": [ - "contexts", - "expected_contexts" + "data" ] }, - "ragas_context_f1Settings": { + "ragas_summarization_scoreSettings": { "type": "object", "properties": { - "distance_measure": { + "model": { + "description": "The model to use for evaluation.", "type": "string", - "default": "levenshtein" + "default": "openai/gpt-5" + }, + "max_tokens": { + "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", + "type": "number", + "default": 2048 } } }, - "ragas_context_precisionRequest": { + "lingua_language_detectionRequest": { "type": "object", "properties": { - "contexts": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Array of context strings used for RAG evaluation" + "trace_id": { + "type": "string", + "description": "Optional trace ID to associate this evaluation with a trace" }, - "expected_contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "input": { + "type": "string", + "description": "The input text to evaluate" + } }, - "description": "The expected contexts for comparison" + "required": [ + "output" + ] } }, "required": [ - "contexts", - "expected_contexts" + "data" ] }, - "ragas_context_precisionSettings": { + "lingua_language_detectionSettings": { "type": "object", "properties": { - "distance_measure": { + "check_for": { + "description": "What should be checked", "type": "string", - "default": "levenshtein" + "default": "input_matches_output" + }, + "expected_language": { + "description": "The specific language that the output is expected to be", + "type": "string" + }, + "min_words": { + "description": "Minimum number of words to check, as the language detection can be unreliable for very short texts. Inputs shorter than the minimum will be skipped.", + "type": "number", + "default": 7 + }, + "threshold": { + "description": "Minimum confidence threshold for the language detection. 
If the confidence is lower than this, the evaluation will be skipped.", + "type": "number", + "default": 0.25 } } }, - "ragas_context_recallRequest": { + "legacy_ragas_answer_correctnessRequest": { "type": "object", "properties": { - "contexts": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Array of context strings used for RAG evaluation" + "trace_id": { + "type": "string", + "description": "Optional trace ID to associate this evaluation with a trace" }, - "expected_contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + }, + "input": { + "type": "string", + "description": "The input text to evaluate" + } }, - "description": "The expected contexts for comparison" + "required": [ + "output", + "expected_output" + ] } }, "required": [ - "contexts", - "expected_contexts" + "data" ] }, - "ragas_context_recallSettings": { + "legacy_ragas_answer_correctnessSettings": { "type": "object", "properties": { - "distance_measure": { + "model": { + "description": "The model to use for evaluation.", "type": "string", - "default": "levenshtein" + "default": "openai/gpt-5" + }, + "embeddings_model": { + "description": "The model to use for embeddings.", + "type": "string", + "default": "openai/text-embedding-ada-002" + }, + "max_tokens": { + "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", + "type": "number", + "default": 2048 } } }, - "ragas_factual_correctnessRequest": { + "legacy_ragas_answer_relevancyRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "expected_output": { - "type": "string", - "description": "The expected output for comparison" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + } + }, + "required": [ + "input", + "output" + ] } }, "required": [ - "output", - "expected_output" + "data" ] }, - "ragas_factual_correctnessSettings": { + "legacy_ragas_answer_relevancySettings": { "type": "object", "properties": { "model": { @@ -4954,53 +5308,56 @@ "type": "string", "default": "openai/gpt-5" }, + "embeddings_model": { + "description": "The model to use for embeddings.", + "type": "string", + "default": "openai/text-embedding-ada-002" + }, "max_tokens": { "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", "type": "number", "default": 2048 - }, - "mode": { - "description": "The mode to use for the factual correctness metric.", - "type": "string", - "default": "f1" - }, - "atomicity": { - "description": "The level of atomicity for claim decomposition.", - "type": "string", - "default": "low" - }, - "coverage": { - "description": "The level of coverage for claim decomposition.", - "type": "string", - "default": "low" } } }, - "ragas_faithfulnessRequest": { + "legacy_ragas_context_precisionRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + } }, - "description": "Array of context strings used for RAG evaluation" - }, - "input": { - "type": "string", - "description": "The input text to evaluate" + "required": [ + "input", + "contexts", + "expected_output" + ] } }, "required": [ - "output", - "contexts" + "data" ] }, - "ragas_faithfulnessSettings": { + "legacy_ragas_context_precisionSettings": { "type": "object", "properties": { "model": { @@ -5008,47 +5365,56 @@ "type": "string", "default": "openai/gpt-5" }, + "embeddings_model": { + "description": "The model to use for embeddings.", + "type": "string", + "default": "openai/text-embedding-ada-002" + }, "max_tokens": { "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", "type": "number", "default": 2048 - }, - "autodetect_dont_know": { - "description": "Whether to autodetect 'I don't know' in the output to avoid failing the evaluation.", - "type": "boolean", - "default": true } } }, - "ragas_response_context_precisionRequest": { + "legacy_ragas_context_recallRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + }, + "expected_output": { + "type": "string", + "description": "The expected output for comparison" + } }, - "description": "Array of context strings used for RAG evaluation" - }, - "output": { - "type": "string", - "description": "The output/response text to evaluate" - }, - "expected_output": { - "type": "string", - "description": "The expected output for comparison" + "required": [ + "input", + "contexts", + "expected_output" + ] } }, "required": [ - "input", - "contexts" + "data" ] }, - "ragas_response_context_precisionSettings": { + "legacy_ragas_context_recallSettings": { "type": "object", "properties": { "model": { @@ -5056,6 +5422,11 @@ "type": "string", "default": "openai/gpt-5" }, + "embeddings_model": { + "description": "The model to use for embeddings.", + "type": "string", + "default": "openai/text-embedding-ada-002" + }, "max_tokens": { "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", "type": "number", @@ -5063,37 +5434,39 @@ } } }, - "ragas_response_context_recallRequest": { + "legacy_ragas_context_relevancyRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "output": { - "type": "string", - "description": "The output/response text to evaluate" - }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + } }, - "description": "Array of context strings used for RAG evaluation" - }, - "expected_output": { - "type": "string", - "description": "The expected output for comparison" + "required": [ + "output", + "contexts" + ] } }, "required": [ - "input", - "output", - "contexts", - "expected_output" + "data" ] }, - "ragas_response_context_recallSettings": { + "legacy_ragas_context_relevancySettings": { "type": "object", "properties": { "model": { @@ -5101,6 +5474,11 @@ "type": "string", "default": "openai/gpt-5" }, + "embeddings_model": { + "description": "The model to use for embeddings.", + "type": "string", + "default": "openai/text-embedding-ada-002" + }, "max_tokens": { "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", "type": "number", @@ -5108,24 +5486,44 @@ } } }, - "ragas_response_relevancyRequest": { + "legacy_ragas_context_utilizationRequest": { "type": "object", "properties": { - "input": { + "trace_id": { "type": "string", - "description": "The input text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "output": { - "type": "string", - "description": "The output/response text to evaluate" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + } + }, + "required": [ + "input", + "output", + "contexts" + ] } }, "required": [ - "input", - "output" + "data" ] }, - "ragas_response_relevancySettings": { + "legacy_ragas_context_utilizationSettings": { "type": "object", "properties": { "model": { @@ -5133,178 +5531,248 @@ "type": "string", "default": "openai/gpt-5" }, - "max_tokens": { - "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - "type": "number", - "default": 2048 - }, "embeddings_model": { "description": "The model to use for embeddings.", "type": "string", "default": "openai/text-embedding-ada-002" + }, + "max_tokens": { + "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", + "type": "number", + "default": 2048 } } }, - "ragas_rouge_scoreRequest": { + "legacy_ragas_faithfulnessRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "expected_output": { - "type": "string", - "description": "The expected output for comparison" + "data": { + "type": "object", + "properties": { + "output": { + "type": "string", + "description": "The output/response text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + } + }, + "required": [ + "output", + "contexts" + ] } }, "required": [ - "output", - "expected_output" + "data" ] }, - "ragas_rouge_scoreSettings": { + "legacy_ragas_faithfulnessSettings": { "type": "object", "properties": { - "rouge_type": { - "description": "ROUGE type", + "model": { + "description": "The model to use for evaluation.", "type": "string", - "default": "rouge1" + "default": "openai/gpt-5" }, - "measure_type": { - "description": "ROUGE measure type", + "embeddings_model": { + "description": "The model to use for embeddings.", "type": "string", - "default": "fmeasure" + "default": "openai/text-embedding-ada-002" + }, + "max_tokens": { + "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", + "type": "number", + "default": 2048 } } }, - "ragas_rubrics_based_scoringRequest": { + "azure_content_safetyRequest": { "type": "object", "properties": { - "input": { - "type": "string", - "description": "The input text to evaluate" - }, - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "expected_output": { - "type": "string", - "description": "The expected output for comparison" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + } + }, + "required": [] } }, "required": [ - "input", - "output" + "data" ] }, - "ragas_rubrics_based_scoringSettings": { + "azure_content_safetySettings": { "type": "object", "properties": { - "model": { - "description": "The model to use for evaluation.", - "type": "string", - "default": "openai/gpt-5" - }, - "max_tokens": { - "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + "severity_threshold": { + "description": "The minimum severity level to consider content as unsafe, from 1 to 7.", "type": "number", - "default": 2048 + "default": 1 }, - "rubrics": { - "type": "array", - "default": [ - { - "description": "The response is incorrect, irrelevant." - }, - { - "description": "The response partially answers the question but includes significant errors, omissions, or irrelevant information." - }, - { - "description": "The response partially answers the question but includes minor errors, omissions, or irrelevant information." - }, - { - "description": "The response fully answers the question and includes minor errors, omissions, or irrelevant information." 
- }, - { - "description": "The response fully answers the question and includes no errors, omissions, or irrelevant information." - } - ] + "categories": { + "description": "The categories of moderation to check for.", + "type": "object", + "default": { + "Hate": true, + "SelfHarm": true, + "Sexual": true, + "Violence": true + } + }, + "output_type": { + "description": "The type of severity levels to return on the full 0-7 severity scale, it can be either the trimmed version with four values (0, 2, 4, 6 scores) or the whole range.", + "type": "string", + "default": "FourSeverityLevels" } } }, - "ragas_sql_query_equivalenceRequest": { + "azure_jailbreakRequest": { "type": "object", "properties": { - "output": { - "type": "string", - "description": "The output/response text to evaluate" - }, - "expected_output": { + "trace_id": { "type": "string", - "description": "The expected output for comparison" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "expected_contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + } }, - "description": "The expected contexts for comparison" + "required": [ + "input" + ] } }, "required": [ - "output", - "expected_output", - "expected_contexts" + "data" ] }, - "ragas_sql_query_equivalenceSettings": { + "azure_jailbreakSettings": { + "type": "object", + "properties": {} + }, + "azure_prompt_injectionRequest": { "type": "object", "properties": { - "model": { - "description": "The model to use for evaluation.", + "trace_id": { "type": "string", - "default": "openai/gpt-5" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "max_tokens": { - "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", - "type": "number", - "default": 2048 + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "contexts": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Array of context strings used for RAG evaluation" + } + }, + "required": [ + "input" + ] } - } + }, + "required": [ + "data" + ] }, - "ragas_summarization_scoreRequest": { + "azure_prompt_injectionSettings": { + "type": "object", + "properties": {} + }, + "presidio_pii_detectionRequest": { "type": "object", "properties": { - "output": { + "trace_id": { "type": "string", - "description": "The output/response text to evaluate" + "description": "Optional trace ID to associate this evaluation with a trace" }, - "contexts": { - "type": "array", - "items": { - "type": "string" + "data": { + "type": "object", + "properties": { + "input": { + "type": "string", + "description": "The input text to evaluate" + }, + "output": { + "type": "string", + "description": "The output/response text to evaluate" + } }, - "description": "Array of context strings used for RAG evaluation" + "required": [] } }, "required": [ - "output", - "contexts" + "data" ] }, - "ragas_summarization_scoreSettings": { + "presidio_pii_detectionSettings": { "type": "object", "properties": { - "model": { - "description": "The model to use for evaluation.", - "type": "string", - "default": "openai/gpt-5" + "entities": { + "description": "The types of PII to check for in the input.", + "type": "object", + "default": { + "credit_card": true, + "crypto": true, + "email_address": true, + "iban_code": true, + "ip_address": true, + "location": false, + "person": false, + "phone_number": true, + "medical_license": true, + "us_bank_number": false, + "us_driver_license": false, + "us_itin": false, + "us_passport": false, + "us_ssn": false, + "uk_nhs": false, + "sg_nric_fin": false, + "au_abn": false, + "au_acn": 
false, + "au_tfn": false, + "au_medicare": false, + "in_pan": false, + "in_aadhaar": false, + "in_vehicle_registration": false, + "in_voter": false, + "in_passport": false + } }, - "max_tokens": { - "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + "min_threshold": { + "description": "The minimum confidence required for failing the evaluation on a PII match.", "type": "number", - "default": 2048 + "default": 0.5 } } }