Integration of Atla LLM Judge with Comet's Opik #114
@@ -0,0 +1 @@
ATLA_API_KEY=your_atla_api_key
@@ -0,0 +1 @@
# LLM as a Judge
@@ -0,0 +1,120 @@
import os
import json
from typing import Any
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
from opik.evaluation.metrics import base_metric, score_result


# Take environment variables from .env
load_dotenv()

# Configure Comet's Opik
import opik
opik.configure(use_local=False)


class LLMJudgeResult(BaseModel):
    """Pydantic model representing the result of LLM judge evaluation."""

    score: int
    reason: str


class LLMJudgeMetric(base_metric.BaseMetric):
    """
    A metric that uses an LLM to judge the quality of AI-generated responses.

    Attributes:
        - name (str): The name of the metric.
        - model_name (str): The name of the LLM model to use for evaluation.
        - llm_client (OpenAI): The client for communicating with the LLM.
        - prompt_template (str): The template for constructing prompts to send to the LLM.

    Methods:
        score(input, output, **ignored_kwargs): Evaluates the AI-generated response against
            the provided input using the LLM as a judge.

    Returns:
        ScoreResult: Contains the numerical score (1-5) and reasoning for the evaluation.
    """

    def __init__(self, name: str = "LLM judge metric", model_name: str = "atla-selene"):
        # Initialize the metric with a name and model name
        self.name = name
        self.model_name = model_name

        # Check if API key is available
        api_key = os.environ.get("ATLA_API_KEY")
        if not api_key:
            raise ValueError(
                "ATLA_API_KEY environment variable not found. Please check your .env file."
            )

        # Initialize the OpenAI client with the API key and base URL for Atla
        self.llm_client = OpenAI(
            api_key=api_key,
            base_url="https://api.atla-ai.com/v1",
        )

        # Define the prompt template for the LLM
        self.prompt_template = """
You are an expert, impartial judge tasked with evaluating an AI-generated response based on a given instruction and scoring rubric.
Provide comprehensive feedback on the response, strictly adhering to the scoring rubric. Follow this with a score between 1 and 5.

The format of your response should be JSON with no backticks that returns:
{{
    "score": <score between 1 and 5>,
    "reason": "<reason for the score>"
}}

Scoring Rubric:
Does the response effectively use humor or wit to enhance the conversation?
Score 1: The response is devoid of any humor or wit.
Score 2: The response attempts humor, but it falls flat or is inappropriate.
Score 3: The response includes humor or wit, but it could be more effectively integrated.
Score 4: The response uses humor or wit effectively in most instances, enhancing the conversation.
Score 5: The response perfectly integrates humor or wit, greatly enhancing the enjoyment of the conversation.

Here is the data to evaluate:
Instruction: {input}
Response: {output}
"""

    @opik.track
    def score(self, input: str, output: str, **ignored_kwargs: Any):
        """Method to evaluate the AI-generated response using the LLM judge."""
        # Apply prompt template and prepare the messages for the LLM
        prompt = self.prompt_template.format(input=input, output=output)
        messages = [{"role": "user", "content": prompt}]

        # Call the LLM with the prepared messages
        try:
            response = self.llm_client.chat.completions.create(
                model=self.model_name, messages=messages
            )
        except Exception as e:
            raise RuntimeError(f"Error calling LLM API: {str(e)}") from e

        # Parse the response from the LLM
        response_content = (
            response.choices[0].message.content
            if hasattr(response, "choices")
            else response.message.content
        )
        result_json = json.loads(response_content)
Comment on lines +106 to +107 (review):

🛠️ Refactor suggestion

Add error handling for JSON parsing

The code assumes the response will always be valid JSON, but this may not be the case. Add error handling to gracefully manage parsing errors.

-        result_json = json.loads(response_content)
+        try:
+            result_json = json.loads(response_content)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Failed to parse LLM response as JSON: {response_content}") from e
+
+        # Validate response structure
+        if "score" not in result_json or "reason" not in result_json:
+            raise ValueError(f"LLM response missing required fields: {result_json}")
        # Return the result as a ScoreResult object with the score and reason
        return score_result.ScoreResult(
            name=self.name, value=result_json["score"], reason=result_json["reason"]
        )


# Example usage
if __name__ == "__main__":
    metric = LLMJudgeMetric()
    metric.score(
        input="Tell me a joke.",
        output="Why did the chicken cross the road? To get to the other side",
    )
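Beyond the `__main__` example above, the metric could also be wired into Opik's batch evaluation flow. The sketch below is not part of this PR: the module name, dataset name, and exact `evaluate()` wiring are assumptions based on the Opik SDK, shown only to illustrate how `LLMJudgeMetric` might score a small dataset.

```python
# Hypothetical usage sketch (not part of this PR): scoring a small dataset with Opik.
# Assumes the file above is saved as main.py and that ATLA_API_KEY is set via .env.
from opik import Opik
from opik.evaluation import evaluate

from main import LLMJudgeMetric  # module name is an assumption

client = Opik()
dataset = client.get_or_create_dataset(name="humor-demo")  # dataset name is an assumption
dataset.insert(
    [
        {
            "input": "Tell me a joke.",
            "output": "Why did the chicken cross the road? To get to the other side",
        }
    ]
)


def evaluation_task(item: dict) -> dict:
    # Pass the stored fields through so LLMJudgeMetric.score(input=..., output=...) receives them.
    return {"input": item["input"], "output": item["output"]}


evaluate(
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=[LLMJudgeMetric()],
)
```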
@@ -0,0 +1,11 @@
[project]
name = "llm-as-judge"
version = "0.1.0"
description = "Integration of Atla LLM Judge with Comet's Opik."
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "openai>=1.75.0",
    "opik>=1.7.7",
    "python-dotenv>=1.1.0",
]
Comment on lines +1 to +11 (review):

Add PEP 621 build-system metadata.

The file currently has no [build-system] table. Add

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

to enable proper builds.
🛠️ Refactor suggestion

Expand the README with project details and instructions

The README currently only contains a title. To onboard users effectively, please add essential sections such as setup (configuring the .env file), usage instructions and parameters, and an example snippet.
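The reviewer's example snippet is not preserved in this extraction. As a purely illustrative sketch (the install command and script name below are assumptions, not part of the PR), an expanded README might look like:

```markdown
# LLM as a Judge

Integration of Atla LLM Judge with Comet's Opik.

## Setup

1. Install the project dependencies, e.g. `pip install -e .` (assumes a build backend is configured).
2. Create a `.env` file with your Atla API key:

   ATLA_API_KEY=your_atla_api_key

## Usage

Run the example script (file name is an assumption) to score a sample response with the
`atla-selene` judge model and log the evaluation to Opik:

    python main.py
```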