Spring with AI (2): 评估答案——UnitTest引入

本文代码:https://github.com/JunTeamCom/ai-demo/tree/release-2.0
Spring with AI系列只关注上层的AI应用程序(基于Java搭建),不关注底层LLM的原理、搭建等技术。

如何评估AI答案效果呢?

  1. 用AI,去评估答案和问题的相关性
  2. 用AI自己,去评估答案的真实准确性(是否符合事实)
  3. 人工比对:比如问的是A领域的问题,却给出了B领域的答案等

1 搭建Mock Test

先不谈评估大模型的答案;先搭建基于Spring+WireMock的单元测试。
添加依赖:

<!-- WireMock is only needed by the unit tests, so keep it off the runtime classpath. -->
<dependency>
	<groupId>org.wiremock.integrations</groupId>
	<artifactId>wiremock-spring-boot</artifactId>
	<version>3.10.0</version>
	<scope>test</scope>
</dependency>

LLM返回的内容是不可精确预料的,而且会消耗Token;所以可以使用WireMock模拟LLM。
test/resources下,添加两个测试配置文件,来定义预设的答案:
test-openapi-response-usa.json

{
  "id": "chatcmpl-yDUbJwsur69ZLTSGiBpCUvL7QAAQ",
  "object": "chat.completion",
  "created": 1771113600,
  "model": "qwen3.5-plus",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "华盛顿",
        "refusal": null,
        "annotations": []
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 11,
    "completion_tokens": 13,
    "total_tokens": 24,
    "prompt_tokens_details": {
      "cached_tokens": 0,
      "audio_tokens": 0
    },
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "audio_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0
    }
  },
  "service_tier": "default",
  "system_fingerprint": null
}

test-openapi-response-uk.json

{
  "id": "chatcmpl-yDUbJwsur69ZLTSGiBpCUvL7QAAR",
  "object": "chat.completion",
  "created": 1771113601,
  "model": "qwen3.5-plus",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "伦敦",
        "refusal": null,
        "annotations": []
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 11,
    "completion_tokens": 13,
    "total_tokens": 24,
    "prompt_tokens_details": {
      "cached_tokens": 0,
      "audio_tokens": 0
    },
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "audio_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0
    }
  },
  "service_tier": "default",
  "system_fingerprint": null
}

单元测试代码:

package com.junteam.ai.demo.service;

import java.io.IOException;
import java.nio.charset.Charset;

import org.assertj.core.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.core.io.Resource;
import org.wiremock.spring.ConfigureWireMock;
import org.wiremock.spring.EnableWireMock;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.tomakehurst.wiremock.client.ResponseDefinitionBuilder;
import com.github.tomakehurst.wiremock.client.WireMock;
import com.junteam.ai.demo.model.ChatQuestion;

/**
 *
 * @author gujun
 */
@EnableWireMock(@ConfigureWireMock(baseUrlProperties = "openai.base.url"))
@SpringBootTest(properties = "spring.ai.openai.base-url=${openai.base.url}")
public class ChatServiceMockTest {
    @Value("classpath:/test-openapi-response-usa.json")
    Resource responseResourceUSA;

    @Value("classpath:/test-openapi-response-uk.json")
    Resource responseResourceUK;

    @Autowired
    ChatClient.Builder chatClientBuilder;

    @BeforeEach
    public void setup() throws IOException{
        
    }

    public OpenAIChatServiceImplWireMockTest() {
    }

    /**
     * Test of ask method, of class OpenAIChatServiceImpl.
     * @throws IOException 
     */
    @SuppressWarnings("null")
    @Test
    public void testAsk() throws IOException {
        var cannedResponse = responseResourceUSA.getContentAsString(Charset.defaultCharset());
        var mapper = new ObjectMapper();
        var responseNode = mapper.readTree(cannedResponse);
        WireMock.stubFor(WireMock.post("/v1/chat/completions")
                .willReturn(ResponseDefinitionBuilder.okForJson(responseNode)));

        var instance = new OpenAIChatServiceImpl(chatClientBuilder);
        var chatAnswer = instance.ask(new ChatQuestion("美国的首都是哪里?"));
        Assertions.assertThat(chatAnswer).isNotNull();
        Assertions.assertThat(chatAnswer.answer()).isEqualTo("华盛顿");

        cannedResponse = responseResourceUK.getContentAsString(Charset.defaultCharset());
        responseNode = mapper.readTree(cannedResponse);
        WireMock.stubFor(WireMock.post("/v1/chat/completions")
                .willReturn(ResponseDefinitionBuilder.okForJson(responseNode)));
        chatAnswer = instance.ask(new ChatQuestion("英国的首都是哪里?"));
        Assertions.assertThat(chatAnswer).isNotNull();
        Assertions.assertThat(chatAnswer.answer()).isEqualTo("伦敦"); 
    }
}

image

2 评估答案

2.1 相关性评估(Relevancy)

package com.junteam.ai.demo.service;

import org.assertj.core.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.ai.chat.evaluation.RelevancyEvaluator;
import org.springframework.ai.evaluation.EvaluationRequest;
import org.springframework.ai.evaluation.EvaluationResponse;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import com.junteam.ai.demo.model.ChatAnswer;
import com.junteam.ai.demo.model.ChatQuestion;

/**
 * Asks the chat service a question and has a {@link RelevancyEvaluator} judge whether
 * the answer is relevant to it.
 *
 * <p>NOTE(review): nothing in this class stubs the model endpoint, so it presumably
 * talks to whatever model the application is configured with — confirm before running in CI.
 *
 * @author gujun
 */
@SpringBootTest
public class ChatServiceTest {
    @Autowired
    private ChatService chatService;

    @Autowired
    private ChatClient.Builder chatClientBuilder;

    private RelevancyEvaluator relevancyEvaluator;

    /** Builds a fresh evaluator from the injected client builder before every test. */
    @BeforeEach
    public void setup() {
        this.relevancyEvaluator = new RelevancyEvaluator(chatClientBuilder);
    }

    /**
     * Sends one question to the chat service, prints the exchange and the evaluator's
     * verdict for debugging, and fails when the evaluator does not pass the answer.
     */
    @Test
    public void evaluateRelevancy() {
        // Fixed wording: the recorded failure output quotes this question as
        // "Why is the sky blue?", not "Why the sky is blue?".
        String userText = "Why is the sky blue?";
        ChatQuestion chatQuestion = new ChatQuestion(userText);
        System.out.println("=== Chat Debug Start ===");
        ChatAnswer chatAnswer = chatService.ask(chatQuestion);
        System.out.println("=== Chat Debug Info ===");
        System.out.println("Question: " + userText);
        System.out.println("Answer: " + chatAnswer.answer());
        EvaluationRequest evaluationRequest = new EvaluationRequest(userText, chatAnswer.answer());
        System.out.println("=== Evaluator Debug Start ===");
        EvaluationResponse evaluationResponse = relevancyEvaluator.evaluate(evaluationRequest);
        // Debug output of the evaluator's verdict.
        System.out.println("=== Evaluator Debug Info ===");
        System.out.println("Score: " + evaluationResponse.getScore()); // relevance score
        System.out.println("Feedback: " + evaluationResponse.getFeedback()); // evaluator's explanation
        System.out.println("Raw Response: " + evaluationResponse.toString());
        System.out.println("============================");
        Assertions.assertThat(evaluationResponse.isPass())
                .withFailMessage("""
                        ========================================
                        The answer "%s"
                        is not considered relevant to the question
                        "%s".
                        ========================================
                        """, chatAnswer.answer(), userText)
                .isTrue();
    }
}

这个测试用例没有通过。
日志如下:

========================================
The answer "The short answer is **Rayleigh scattering**.

Here is a step-by-step breakdown of why this happens:

**1. Sunlight looks white, but it isn't**
Sunlight appears white to us, but it is actually made up of all the colors of the rainbow (red, orange, yellow, green, blue, indigo, and violet). You can see this when sunlight passes through a prism or water droplets to create a rainbow.

**2. Light travels in waves**
Each color of light travels in a wave of a different size (wavelength).
*   **Red light** has longer, lazier waves.
*   **Blue and violet light** have shorter, choppier waves.

**3. The atmosphere is full of obstacles**
Earth's atmosphere is filled with gas molecules, primarily nitrogen and oxygen. These molecules are smaller than the wavelength of visible light.

**4. Scattering occurs**
When sunlight passes through the atmosphere, the longer waves (reds and yellows) pass through the gas molecules relatively easily. However, the shorter waves (blues and violets) hit the gas molecules and **scatter** in every direction. This is known as *Rayleigh scattering*.

**5. What we see**
When you look up at the sky, your eyes catch this scattered blue light coming from all directions. Because blue is scattered more strongly than any other color, the sky looks blue to us.

***

**Two common follow-up questions:**

*   **Why isn't the sky violet?**
Violet light actually scatters even more than blue light. However, the sky isn't violet for two reasons:
1.  The sun emits much less violet light than blue light.
2.  Human eyes are much more sensitive to blue light than violet light.

*   **Why are sunsets red?**
When the sun is setting, it is lower on the horizon. The light has to travel through much more atmosphere to reach your eyes than it does at noon. By the time the light arrives, most of the blue light has been scattered away completely, leaving only the longer wavelengths (reds and oranges) to pass through to your eyes."
is not considered relevant to the question
"Why is the sky blue?".
========================================

at com.junteam.ai.demo.service.ChatServiceTest.evaluateRelevancy(ChatServiceTest.java:55)
at java.base/java.lang.reflect.Method.invoke(Method.java:565)
at java.base/java.util.ArrayList.forEach(ArrayList.java:1604)
at java.base/java.util.ArrayList.forEach(ArrayList.java:1604)


Results:

Failures:
ChatServiceTest.evaluateRelevancy:55 ========================================
The answer "The short answer is **Rayleigh scattering**.

Here is a step-by-step breakdown of why this happens:

**1. Sunlight looks white, but it isn't**
Sunlight appears white to us, but it is actually made up of all the colors of the rainbow (red, orange, yellow, green, blue, indigo, and violet). You can see this when sunlight passes through a prism or water droplets to create a rainbow.

**2. Light travels in waves**
Each color of light travels in a wave of a different size (wavelength).
*   **Red light** has longer, lazier waves.
*   **Blue and violet light** have shorter, choppier waves.

**3. The atmosphere is full of obstacles**
Earth's atmosphere is filled with gas molecules, primarily nitrogen and oxygen. These molecules are smaller than the wavelength of visible light.

**4. Scattering occurs**
When sunlight passes through the atmosphere, the longer waves (reds and yellows) pass through the gas molecules relatively easily. However, the shorter waves (blues and violets) hit the gas molecules and **scatter** in every direction. This is known as *Rayleigh scattering*.

**5. What we see**
When you look up at the sky, your eyes catch this scattered blue light coming from all directions. Because blue is scattered more strongly than any other color, the sky looks blue to us.

***

**Two common follow-up questions:**

*   **Why isn't the sky violet?**
Violet light actually scatters even more than blue light. However, the sky isn't violet for two reasons:
1.  The sun emits much less violet light than blue light.
2.  Human eyes are much more sensitive to blue light than violet light.

*   **Why are sunsets red?**
When the sun is setting, it is lower on the horizon. The light has to travel through much more atmosphere to reach your eyes than it does at noon. By the time the light arrives, most of the blue light has been scattered away completely, leaving only the longer wavelengths (reds and oranges) to pass through to your eyes."
is not considered relevant to the question
"Why is the sky blue?".
========================================


Tests run: 1, Failures: 1, Errors: 0, Skipped: 0

回答内容在人类看来是完美相关的,但你的自动化评估器(Evaluator)认为它不相关。
可能的原因:
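评估器使用的 Prompt 可能要求答案必须包含特定的句式(例如必须以 "The sky is blue because..." 开头),而AI的答案是以 "The short answer is..." 开头。另一个更可能的原因是:RelevancyEvaluator 主要面向 RAG 场景,它的默认 Prompt 会结合上下文(EvaluationRequest 中的 dataList)来判断回答是否相关;而上面的测试没有传入任何上下文,评估模型因此可能直接判定为不相关。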
需要注意的是,评估很消耗资源,速度也会相当慢。
再重新修改问题:

美国的首都哪里?

也没有通过。

美国的首都是哪里?

通过测试。

2.2 正确性评估(Factual Accuracy)

需要注意的是:对于正确性评估(FactCheckingEvaluator),千问默认的API没有完整支持,
因此需要自己写一个评估方法。

package com.junteam.ai.demo.service;

import org.assertj.core.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.ai.chat.evaluation.FactCheckingEvaluator;
import org.springframework.ai.chat.evaluation.RelevancyEvaluator;
import org.springframework.ai.evaluation.EvaluationRequest;
import org.springframework.ai.evaluation.EvaluationResponse;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.junteam.ai.demo.model.ChatAnswer;
import com.junteam.ai.demo.model.ChatQuestion;

/**
 * Evaluates chat-service answers in two ways: relevance to the question (via
 * {@link RelevancyEvaluator}) and factual accuracy (via {@link FactCheckingEvaluator}
 * and a hand-written prompt-based evaluator).
 *
 * @author gujun
 */
@SpringBootTest
public class ChatServiceTest {
    /** Parser for the custom evaluator's JSON reply; reused across calls. */
    private static final ObjectMapper JSON = new ObjectMapper();

    @Autowired
    private ChatService chatService;

    @Autowired
    private ChatClient.Builder chatClientBuilder;

    private RelevancyEvaluator relevancyEvaluator;

    private FactCheckingEvaluator factCheckingEvaluator;

    /** Builds fresh evaluators from the injected client builder before every test. */
    @BeforeEach
    public void setup() {
        this.relevancyEvaluator = new RelevancyEvaluator(chatClientBuilder);
        this.factCheckingEvaluator = FactCheckingEvaluator.builder(chatClientBuilder).build();
    }

    /**
     * Sends one question to the chat service, prints the exchange and the relevancy
     * verdict for debugging, and fails when the evaluator does not pass the answer.
     */
    @Test
    public void evaluateRelevancy() {
        String userText = "美国的首都是哪里?";
        ChatQuestion chatQuestion = new ChatQuestion(userText);
        System.out.println("=== Chat Debug Start ===");
        ChatAnswer chatAnswer = chatService.ask(chatQuestion);
        System.out.println("=== Chat Debug Info ===");
        System.out.println("Question: " + userText);
        System.out.println("Answer: " + chatAnswer.answer());

        EvaluationRequest evaluationRequest = new EvaluationRequest(userText, chatAnswer.answer());
        // Relevancy evaluation.
        var response = relevancyEvaluator.evaluate(evaluationRequest);

        System.out.println("=== Evaluator Debug Info ===");
        System.out.println("Score: " + response.getScore()); // relevance score
        System.out.println("Feedback: " + response.getFeedback()); // evaluator's explanation
        System.out.println("Raw Response: " + response.toString());
        System.out.println("============================");

        Assertions.assertThat(response.isPass())
                .withFailMessage("""
                        ========================================
                        The answer "%s"
                        is not considered relevant to the question
                        "%s".
                        ========================================
                        """, chatAnswer.answer(), userText)
                .isTrue();
    }

    /**
     * Hand-written fact-checking evaluator: asks the model, through an explicit Chinese
     * prompt, to return a JSON verdict and converts that verdict into an
     * {@link EvaluationResponse}.
     *
     * <p>A missing, blank or unparseable reply is reported as a failed evaluation
     * (pass = false, score = 0) rather than as an exception. Keys absent from the
     * reply fall back to {@code false}, {@code 0.0} and an empty feedback string.
     *
     * @param evaluationRequest carries the question and the answer to be checked
     * @return the parsed verdict, or a failed verdict describing why parsing was not possible
     */
    @SuppressWarnings({"null", "CallToPrintStackTrace"})
    private EvaluationResponse factCheckingEvaluateWithQwen(EvaluationRequest evaluationRequest) {
        var client = chatClientBuilder.build();

        // Explicit Chinese prompt that demands a JSON-only reply.
        String prompt = String.format("""
                你是一个事实核查助手。
                问题:%s
                回答:%s

                请判断上述回答是否符合客观事实。
                请仅返回一个 JSON 对象,不要包含任何其他文字。格式如下:
                {"pass": true/false, "score": 1.0或0.0, "feedback": "简短的理由"}
                """, evaluationRequest.getUserText(), evaluationRequest.getResponseContent());

        String content = client.prompt(prompt).call().content();
        System.out.println("Custom Evaluator Raw Response: " + content);

        if (content == null || content.isBlank()) {
            // Nothing to parse; treat as a failed evaluation instead of letting the parser throw.
            return new EvaluationResponse(false, 0.0f, "Evaluator returned no content", null);
        }

        try {
            var responseNode = JSON.readTree(extractJsonObject(content));
            // path(...) yields a "missing" node for absent keys, so the defaults apply
            // instead of a NullPointerException.
            return new EvaluationResponse(
                    responseNode.path("pass").asBoolean(false),
                    (float) responseNode.path("score").asDouble(0.0),
                    responseNode.path("feedback").asText(""),
                    null
            );
        } catch (JsonProcessingException e) {
            e.printStackTrace();
            return new EvaluationResponse(false, 0.0f, e.getMessage(), null);
        }
    }

    /**
     * Cuts the reply down to the span from the first '{' to the last '}', so that a
     * JSON object wrapped in Markdown fences or surrounding text can still be parsed.
     *
     * @param content raw model reply, not null
     * @return the brace-delimited span, or {@code content} unchanged when no such span exists
     */
    private static String extractJsonObject(String content) {
        int start = content.indexOf('{');
        int end = content.lastIndexOf('}');
        return (start >= 0 && end > start) ? content.substring(start, end + 1) : content;
    }

    /**
     * Runs three fact checks for the same question and prints each verdict: the built-in
     * evaluator on a fixed reference answer, the built-in evaluator on the chat service's
     * answer, and the custom evaluator on the chat service's answer. Only the custom
     * evaluator's verdict is asserted.
     */
    @Test
    public void evaluateFactualAccuracy() {
        String userText = "美国首都是哪里?";
        ChatQuestion chatQuestion = new ChatQuestion(userText);
        System.out.println("=== Chat Debug Start ===");
        ChatAnswer chatAnswer = chatService.ask(chatQuestion);
        System.out.println("=== Chat Debug Info ===");
        System.out.println("Question: " + userText);
        System.out.println("Answer: " + chatAnswer.answer());

        var answer0 = "华盛顿特区";
        EvaluationRequest evaluationRequest0 = new EvaluationRequest(userText, answer0);

        System.out.println("=== Evaluator Debug Start ===");
        // Built-in fact check of the fixed reference answer.
        var response0 = factCheckingEvaluator.evaluate(evaluationRequest0);

        // Debug output of that verdict.
        System.out.println("=== Evaluator0 Debug Info ===");
        System.out.println("Score: " + response0.getScore()); // fact-check score
        System.out.println("Feedback: " + response0.getFeedback()); // evaluator's explanation
        System.out.println("Raw Response: " + response0.toString());

        EvaluationRequest evaluationRequest = new EvaluationRequest(userText, chatAnswer.answer());
        // Built-in fact check of the chat service's answer.
        var response = factCheckingEvaluator.evaluate(evaluationRequest);

        System.out.println("=== Evaluator Debug Info ===");
        System.out.println("Score: " + response.getScore()); // fact-check score
        System.out.println("Feedback: " + response.getFeedback()); // evaluator's explanation
        System.out.println("Raw Response: " + response.toString());
        System.out.println("============================");

        // Custom prompt-based fact check of the chat service's answer.
        var response1 = factCheckingEvaluateWithQwen(evaluationRequest);

        System.out.println("=== Evaluator1 Debug Info ===");
        System.out.println("Score: " + response1.getScore()); // fact-check score
        System.out.println("Feedback: " + response1.getFeedback()); // evaluator's explanation
        System.out.println("Raw Response: " + response1.toString());
        System.out.println("============================");

        Assertions.assertThat(response1.isPass())
                .withFailMessage("""
                        ========================================
                        The answer "%s"
                        is not considered factually accurate to the question
                        "%s".
                        ========================================
                        """, chatAnswer.answer(), userText)
                .isTrue();

        // response0 and response (the built-in FactCheckingEvaluator verdicts) are printed
        // for comparison only and deliberately not asserted: per the accompanying write-up,
        // the built-in fact check is not fully supported by Qwen's default API.
    }
}

3 生成时纠正

可以在生成答案时先评估相关性,评估不通过就重新生成;本文不再展开。
可以参考代码:

  /**
   * Sends the question to the chat client, checks the generated text for relevance,
   * and wraps it in an {@code Answer}.
   *
   * <p>Annotated with {@code @Retryable} for {@code AnswerNotRelevantException}, the
   * exception the relevance check throws when the evaluator does not pass the answer.
   *
   * @param question the question to ask
   * @return the generated answer
   */
  @Override
  @Retryable(retryFor = AnswerNotRelevantException.class)
  public Answer askQuestion(Question question) {
    var request = chatClient.prompt().user(question.question());
    var generated = request.call().content();
    evaluateRelevancy(question, generated);
    return new Answer(generated);
  }
  
    /**
     * Runs the evaluator on the question/answer pair and rejects the answer when the
     * evaluator does not pass it.
     *
     * @param question   the question that was asked
     * @param answerText the generated answer to check
     * @throws AnswerNotRelevantException when the evaluation does not pass
     */
    private void evaluateRelevancy(Question question, String answerText) {
      var verdict = evaluator.evaluate(new EvaluationRequest(question.question(), answerText));
      if (verdict.isPass()) {
        return;
      }
      throw new AnswerNotRelevantException(question.question(), answerText);
    }
  
/**
 * Signals that a generated answer was judged not relevant to its question.
 * The message quotes both the answer and the question.
 */
public class AnswerNotRelevantException extends RuntimeException {
    /**
     * @param question the question that was asked
     * @param answer   the answer that was rejected
     */
    public AnswerNotRelevantException(String question, String answer) {
        super(String.format("The answer '%s' is not relevant to the question '%s'.", answer, question));
    }
}
posted @ 2026-03-16 22:32  gujunge  阅读(11)  评论(0)    收藏  举报