# b.py (forked from confident-ai/deepeval)
from typing import List

from deepeval.metrics.base_metric import BaseMetric
from deepeval.metrics.utils import check_llm_test_case_params
from deepeval.test_case import LLMTestCase, ConversationalTestCase
from deepeval import confident_evaluate, evaluate
from deepeval.metrics import (
AnswerRelevancyMetric,
BiasMetric,
FaithfulnessMetric,
ConversationCompletenessMetric,
)
from deepeval.test_case.llm_test_case import LLMTestCaseParams

# Conversational test cases: test_case feeds the commented-out
# confident_evaluate call at the bottom of the file; test_case2 is
# currently unused.
test_case = ConversationalTestCase(
turns=[
LLMTestCase(
input="Message input", actual_output="Message actual output"
)
]
)

test_case2 = ConversationalTestCase(
turns=[
LLMTestCase(
input="Message input", actual_output="Message actual output"
)
]
)

required_params: List[LLMTestCaseParams] = [
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.RETRIEVAL_CONTEXT,
]


class FakeMetric(BaseMetric):
    """Stub metric that always returns a perfect score, used to exercise the evaluation pipeline."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase, _show_indicator: bool):
        check_llm_test_case_params(test_case, required_params, self)
        self.score = 1
        self.success = self.score >= self.threshold
        self.reason = "This metric is looking good!"
        return self.score

    async def a_measure(self, test_case: LLMTestCase, _show_indicator: bool):
        check_llm_test_case_params(test_case, required_params, self)
        self.score = 1
        self.success = self.score >= self.threshold
        self.reason = "This metric is looking good!"
        return self.score

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Fake"
evaluate(
test_cases=[
LLMTestCase(
input="Message input", actual_output="Message actual output"
),
LLMTestCase(
input="Message input 2",
actual_output="Message actual output 2",
retrieval_context=[""],
),
],
metrics=[FakeMetric(), FaithfulnessMetric()],
skip_on_missing_params=True,
ignore_errors=True,
)

# confident_evaluate(experiment_name="Convo", test_cases=[test_case])

# evaluate(
#     test_cases=[
#         LLMTestCase(
#             input="Message input", actual_output="Message actual output"
#         )
#     ],
#     metrics=[
#         AnswerRelevancyMetric(),
#         BiasMetric(),
#         FaithfulnessMetric(),
#         ConversationCompletenessMetric(),
#     ],
#     run_async=True,
#     ignore_errors=True,
# )