> ## Documentation Index
> Fetch the complete documentation index at: https://langwatch.ai/docs/llms.txt
> Use this file to discover all available pages before exploring further.

# LLM-as-a-Judge Score Evaluator

> Use an LLM as a judge with custom prompt to do a numeric score evaluation of the message.



## OpenAPI

````yaml post /langevals/llm_score/evaluate
openapi: 3.1.0
info:
  title: LangEvals API
  version: 1.0.0
  description: API for LangEvals evaluators
servers:
  - url: https://app.langwatch.ai/api/evaluations
    description: Production server
security:
  - api_key: []
paths:
  /langevals/llm_score/evaluate:
    post:
      summary: LLM-as-a-Judge Score Evaluator
      description: >-
        Use an LLM as a judge with custom prompt to do a numeric score
        evaluation of the message.
      operationId: langevals_llm_score_evaluate
      requestBody:
        content:
          application/json:
            schema:
              allOf:
                - $ref: '#/components/schemas/langevals_llm_scoreRequest'
                - type: object
                  properties:
                    settings:
                      $ref: '#/components/schemas/langevals_llm_scoreSettings'
        required: true
      responses:
        '200':
          description: Successful evaluation
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/EvaluationResult'
        '400':
          description: Bad request
          content:
            application/json:
              schema:
                type: object
                properties:
                  detail:
                    type: string
        '500':
          description: Internal server error
          content:
            application/json:
              schema:
                type: object
                properties:
                  detail:
                    type: string
      x-codeSamples:
        - lang: python
          label: Experiment
          source: |
            import langwatch

            df = langwatch.datasets.get_dataset("dataset-id").to_pandas()

            experiment = langwatch.experiment.init("my-experiment")

            for index, row in experiment.loop(df.iterrows()):
                # your execution code here
                experiment.evaluate(
                    "langevals/llm_score",
                    index=index,
                    data={
                        "input": row["input"],
                        "output": output,
                        "contexts": row["contexts"],
                    },
                    settings={}
                )
        - lang: python
          label: Online Evaluation
          source: |-
            import langwatch

            @langwatch.span()
            def my_llm_step():
                ... # your existing code
                result = langwatch.evaluation.evaluate(
                    "langevals/llm_score",
                    name="My LLM-as-a-Judge Score Evaluator Check",
                    data={
                        "input": "",
                        "output": "",
                        "contexts": [],
                    },
                    settings={},
                )
                print(result)
        - lang: typescript
          label: Experiment
          source: >-
            import { LangWatch } from "langwatch";


            const langwatch = new LangWatch();


            // Fetch dataset from LangWatch

            const dataset = await langwatch.datasets.get("your-dataset-slug");


            const experiment = await
            langwatch.experiments.init("my-experiment");


            await experiment.run(
              dataset.entries.map((e) => e.entry),
              async ({ item, index }) => {
                // Run your LLM/agent
                const output = await myLLM(item.input);

                // Evaluate the output
                await experiment.evaluate("langevals/llm_score", {
                  index,
                  data: {
                    input: item.input,
                    output: output,
                    contexts: item.contexts,
                  },
                });
              },
              { concurrency: 4 }
            );
        - lang: typescript
          label: Online Evaluation
          source: |-
            import { LangWatch } from "langwatch";

            const langwatch = new LangWatch();

            async function myLLMStep(input: string): Promise<string> {
              // ... your existing code

              // Call the evaluator
              const result = await langwatch.evaluations.evaluate("langevals/llm_score", {
                name: "my-evaluation",
                data: {
                  input: "", // your input value
                  output: "", // your output value
                  contexts: [], // your contexts value (array of strings)
                },
                settings: {},
              });

              console.log(result);
              return result;
            }
components:
  schemas:
    langevals_llm_scoreRequest:
      type: object
      properties:
        trace_id:
          type: string
          description: Optional trace ID to associate this evaluation with a trace
        data:
          type: object
          properties:
            input:
              type: string
              description: The input text to evaluate
            output:
              type: string
              description: The output/response text to evaluate
            contexts:
              type: array
              items:
                type: string
              description: Array of context strings used for RAG evaluation
          required: []
      required:
        - data
    langevals_llm_scoreSettings:
      type: object
      properties:
        model:
          description: The model to use for evaluation
          type: string
          default: openai/gpt-5
        max_tokens:
          description: Max tokens allowed for evaluation
          type: number
          default: 128000
        prompt:
          description: The system prompt to use for the LLM to run the evaluation
          type: string
          default: >-
            You are an LLM evaluator. Please score from 0.0 to 1.0 how likely
            the user is to be satisfied with this answer, from 0.0 being not
            satisfied at all to 1.0 being completely satisfied
    EvaluationResult:
      type: object
      properties:
        status:
          type: string
          enum:
            - processed
            - skipped
            - error
        score:
          type: number
          description: Numeric score from the evaluation
        passed:
          type: boolean
          description: Whether the evaluation passed
        label:
          type: string
          description: Label assigned by the evaluation
        details:
          type: string
          description: Additional details about the evaluation
        cost:
          type: object
          properties:
            currency:
              type: string
            amount:
              type: number
  securitySchemes:
    api_key:
      type: apiKey
      in: header
      name: X-Auth-Token
      description: API key for authentication

````