
Commit

evaluation
kritinv committed Sep 11, 2024
1 parent 304526d commit 0877b01
Showing 9 changed files with 546 additions and 56 deletions.
241 changes: 211 additions & 30 deletions deepeval/evaluate.py

Large diffs are not rendered by default.

46 changes: 45 additions & 1 deletion deepeval/metrics/base_metric.py
@@ -1,7 +1,7 @@
from abc import abstractmethod
from typing import Optional, Dict

from deepeval.test_case import LLMTestCase, ConversationalTestCase
from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase


class BaseMetric:
@@ -74,3 +74,47 @@ def is_successful(self) -> bool:
    @property
    def __name__(self):
        return "Base Conversational Metric"

class BaseMultimodalMetric:
    score: Optional[float] = None
    score_breakdown: Dict = None
    reason: Optional[str] = None
    success: Optional[bool] = None
    evaluation_model: Optional[str] = None
    strict_mode: bool = False
    async_mode: bool = True
    verbose_mode: bool = True
    include_reason: bool = False
    error: Optional[str] = None
    evaluation_cost: Optional[float] = None
    verbose_logs: Optional[str] = None

    @property
    def threshold(self) -> float:
        return self._threshold

    @threshold.setter
    def threshold(self, value: float):
        self._threshold = value

    @abstractmethod
    def measure(
        self, test_case: MLLMTestCase, *args, **kwargs
    ) -> float:
        raise NotImplementedError

    @abstractmethod
    async def a_measure(
        self, test_case: MLLMTestCase, *args, **kwargs
    ) -> float:
        raise NotImplementedError(
            f"Async execution for {self.__class__.__name__} not supported yet. Please set 'async_mode' to 'False'."
        )

    @abstractmethod
    def is_successful(self) -> bool:
        raise NotImplementedError

    @property
    def __name__(self):
        return "Base Multimodal Metric"
16 changes: 8 additions & 8 deletions deepeval/metrics/indicator.py
@@ -6,8 +6,8 @@
import time
import asyncio

from deepeval.metrics import BaseMetric, BaseConversationalMetric
from deepeval.test_case import LLMTestCase, ConversationalTestCase
from deepeval.metrics import BaseMetric, BaseConversationalMetric, BaseMultimodalMetric
from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
from deepeval.test_run.cache import CachedTestCase, Cache
from deepeval.telemetry import capture_metric_type

@@ -53,8 +53,8 @@ def metric_progress_indicator(
async def measure_metric_task(
    task_id,
    progress,
    metric: Union[BaseMetric, BaseConversationalMetric],
    test_case: Union[LLMTestCase, ConversationalTestCase],
    metric: Union[BaseMetric, BaseConversationalMetric, BaseMultimodalMetric],
    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
    cached_test_case: Union[CachedTestCase, None],
    ignore_errors: bool,
):
@@ -109,8 +109,8 @@ async def measure_metric_task(


async def measure_metrics_with_indicator(
    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
    test_case: Union[LLMTestCase, ConversationalTestCase],
    metrics: List[Union[BaseMetric, BaseConversationalMetric, BaseMultimodalMetric]],
    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
    cached_test_case: Union[CachedTestCase, None],
    ignore_errors: bool,
    show_indicator: bool,
@@ -172,8 +172,8 @@ async def measure_metrics_with_indicator(


async def safe_a_measure(
    metric: Union[BaseMetric, BaseConversationalMetric],
    tc: Union[LLMTestCase, ConversationalTestCase],
    metric: Union[BaseMetric, BaseConversationalMetric, BaseMultimodalMetric],
    tc: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
    ignore_errors: bool,
):
    try:
66 changes: 62 additions & 4 deletions deepeval/metrics/utils.py
@@ -1,21 +1,23 @@
import inspect
import json
from typing import Any, Dict, Optional, List, Union, Tuple
from deepeval.models import GPTModel, DeepEvalBaseLLM
from deepeval.models import GPTModel, DeepEvalBaseLLM, MultimodalGPTModel, DeepEvalBaseMLLM
from deepeval.models.gpt_model_schematic import SchematicGPTModel

from deepeval.metrics import BaseMetric, BaseConversationalMetric
from deepeval.metrics import BaseMetric, BaseConversationalMetric, BaseMultimodalMetric
from deepeval.test_case import (
    LLMTestCase,
    LLMTestCaseParams,
    MLLMTestCase,
    MLLMTestCaseParams,
    ConversationalTestCase,
    Message,
)


def copy_metrics(
    metrics: Union[List[BaseMetric], List[BaseConversationalMetric]]
) -> Union[List[BaseMetric], List[BaseConversationalMetric]]:
    metrics: Union[List[BaseMetric], List[BaseConversationalMetric], List[BaseMultimodalMetric]]
) -> Union[List[BaseMetric], List[BaseConversationalMetric], List[BaseMultimodalMetric]]:
    copied_metrics = []
    for metric in metrics:
        metric_class = type(metric)
@@ -157,6 +159,36 @@ def check_llm_test_case_params(
        raise ValueError(error_str)


def check_mllm_test_case_params(
    test_case: MLLMTestCase,
    test_case_params: List[MLLMTestCaseParams],
    metric: BaseMetric,
):
    if isinstance(test_case, MLLMTestCase) is False:
        error_str = f"Unable to evaluate test cases that are not of type 'MLLMTestCase' using the '{metric.__name__}' metric."
        metric.error = error_str
        raise ValueError(error_str)

    missing_params = []
    for param in test_case_params:
        if getattr(test_case, param.value) is None:
            missing_params.append(f"'{param.value}'")

    if missing_params:
        if len(missing_params) == 1:
            missing_params_str = missing_params[0]
        elif len(missing_params) == 2:
            missing_params_str = " and ".join(missing_params)
        else:
            missing_params_str = (
                ", ".join(missing_params[:-1]) + ", and " + missing_params[-1]
            )

        error_str = f"{missing_params_str} cannot be None for the '{metric.__name__}' metric"
        metric.error = error_str
        raise ValueError(error_str)


def trimAndLoadJson(
    input_string: str, metric: Optional[BaseMetric] = None
) -> Any:
@@ -196,6 +228,32 @@ def initialize_model(
    return GPTModel(model=model), True


def initialize_multimodal_model(
    model: Optional[Union[str, DeepEvalBaseMLLM, MultimodalGPTModel]] = None,
) -> Tuple[DeepEvalBaseMLLM, bool]:
    """
    Returns a tuple of (initialized DeepEvalBaseMLLM, using_native_model boolean)
    """
    # If model is a MultimodalGPTModel, it should be deemed as using a native model
    if isinstance(model, MultimodalGPTModel):
        return model, True
    # If model is a DeepEvalBaseMLLM but not a MultimodalGPTModel, we cannot assume it is a native model
    if isinstance(model, DeepEvalBaseMLLM):
        return model, False
    # Otherwise (the model is a string or None), we initialize a MultimodalGPTModel and use it as a native model
    return MultimodalGPTModel(model=model), True


def print_verbose_logs(metric: str, logs: str):
print("*" * 50)
print(f"{metric} Verbose Logs")
print("*" * 50)
print("")
print(logs)
print("")
print("=" * 70)


def initialize_schematic_model(
    model: Optional[Union[str, DeepEvalBaseLLM, SchematicGPTModel]] = None,
) -> Tuple[DeepEvalBaseLLM, bool]:
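
As a quick illustration of the new validation helper above, the sketch below builds an MLLMTestCase and checks it against a hypothetical list of required fields; the _StubMetric class is not part of this commit and only supplies the error and __name__ attributes the helper touches (real metrics derive from BaseMultimodalMetric).

from PIL import Image
from deepeval.metrics.utils import check_mllm_test_case_params
from deepeval.test_case import MLLMTestCase, MLLMTestCaseParams

class _StubMetric:
    # Minimal stand-in for a metric object.
    error = None

    @property
    def __name__(self):
        return "Stub Multimodal Metric"

test_case = MLLMTestCase(
    input_text="Generate a red square",
    actual_output_image=Image.new("RGB", (64, 64), "red"),
)

# Passes silently here; with a missing required field it raises ValueError and
# records the message on the metric's `error` attribute.
check_mllm_test_case_params(
    test_case,
    [MLLMTestCaseParams.INPUT_TEXT, MLLMTestCaseParams.ACTUAL_OUTPUT_IMAGE],
    _StubMetric(),
)
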
1 change: 1 addition & 0 deletions deepeval/test_case/__init__.py
@@ -1,2 +1,3 @@
from .llm_test_case import LLMTestCase, LLMTestCaseParams
from .conversational_test_case import ConversationalTestCase, Message
from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams
36 changes: 36 additions & 0 deletions deepeval/test_case/mllm_test_case.py
@@ -0,0 +1,36 @@
from pydantic import Field
from dataclasses import dataclass, field
from typing import List, Optional, Dict
from enum import Enum
from PIL.Image import Image as ImageType

class MLLMTestCaseParams(Enum):
    INPUT_TEXT = "input_text"
    ACTUAL_OUTPUT_IMAGE = "actual_output_image"
    INPUT_IMAGE = "input_image"
    ACTUAL_OUTPUT_TEXT = "actual_output_text"

@dataclass
class MLLMTestCase:
    input_text: str
    actual_output_image: ImageType
    input_image: Optional[ImageType] = None
    actual_output_text: Optional[str] = None
    additional_metadata: Optional[Dict] = None
    comments: Optional[str] = None
    _dataset_rank: Optional[int] = field(default=None, repr=False)
    _dataset_alias: Optional[str] = field(default=None, repr=False)
    _dataset_id: Optional[str] = field(default=None, repr=False)

    def __post_init__(self):
        # Ensure `input_image` is None or an ImageType
        if self.input_image is not None:
            if not isinstance(self.input_image, ImageType):
                raise TypeError("'input_image' must be None or a PIL Image")

        # Ensure `actual_output_text` is None or a string
        if self.actual_output_text is not None:
            if not isinstance(self.actual_output_text, str):
                raise TypeError(
                    "'actual_output_text' must be None or a string"
                )
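
A short usage sketch for the new test case type (the prompt and image are placeholders): the two required fields are passed directly, and __post_init__ type-checks the optional ones.

from PIL import Image
from deepeval.test_case import MLLMTestCase

generated_image = Image.new("RGB", (256, 256), "white")  # placeholder for a model-generated image

test_case = MLLMTestCase(
    input_text="Draw a plain white square",
    actual_output_image=generated_image,
    actual_output_text="A 256x256 white square.",  # optional text output
)

# Passing a non-image for `input_image` (e.g. a file path string) raises TypeError
# from __post_init__.
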
1 change: 1 addition & 0 deletions deepeval/test_run/__init__.py
@@ -3,6 +3,7 @@
    global_test_run_manager,
    TEMP_FILE_NAME,
    LLMApiTestCase,
    MLLMApiTestCase,
    ConversationalApiTestCase,
    TestRunManager,
)
49 changes: 47 additions & 2 deletions deepeval/test_run/api.py
@@ -1,6 +1,6 @@
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, ConfigDict
from typing import Optional, List, Union, Dict

from PIL.Image import Image as ImageType

class MetricData(BaseModel):
    name: str
@@ -70,6 +70,51 @@ def update_run_duration(self, run_duration: float):
        self.run_duration = run_duration


class MLLMApiTestCase(BaseModel):
    name: str
    input_text: str = Field(..., alias="inputText")
    actual_output_image: ImageType = Field(..., alias="actualOutputImage")
    input_image: Optional[ImageType] = Field(None, alias="inputImage")
    actual_output_text: Optional[str] = Field(None, alias="actualOutputText")
    success: Union[bool, None] = Field(None)
    # make optional, not all test cases in a conversation will be evaluated
    metrics_data: Union[List[MetricData], None] = Field(
        None, alias="metricsData"
    )
    run_duration: Union[float, None] = Field(None, alias="runDuration")
    evaluation_cost: Union[float, None] = Field(None, alias="evaluationCost")
    order: Union[int, None] = Field(None)

    # Allow arbitrary types
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def update_metric_data(self, metric_data: MetricData):
        if self.metrics_data is None:
            self.metrics_data = [metric_data]
        else:
            self.metrics_data.append(metric_data)

        if self.success is None:
            # self.success will be None when it is a message
            # in that case we will be setting success for the first time
            self.success = metric_data.success
        else:
            if metric_data.success is False:
                self.success = False

        evaluationCost = metric_data.evaluation_cost
        if evaluationCost is None:
            return

        if self.evaluation_cost is None:
            self.evaluation_cost = evaluationCost
        else:
            self.evaluation_cost += evaluationCost

    def update_run_duration(self, run_duration: float):
        self.run_duration = run_duration

class ConversationalApiTestCase(BaseModel):
    name: str
    success: bool
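
For context, a sketch of constructing the new API model with placeholder values: initialization goes through the camelCase pydantic aliases, and arbitrary_types_allowed is what lets a raw PIL image be stored on the model.

from PIL import Image
from deepeval.test_run import MLLMApiTestCase

api_test_case = MLLMApiTestCase(
    name="mllm_case_0",
    inputText="Draw a plain white square",                  # aliases, not field names
    actualOutputImage=Image.new("RGB", (64, 64), "white"),  # permitted by arbitrary_types_allowed
    order=0,
)

api_test_case.update_run_duration(1.25)

# update_metric_data(...) appends each MetricData result, AND-combines the
# per-metric success flags into `success`, and accumulates `evaluation_cost`.
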
