langgenius · jiangbo721 · Mar 7, 2025 · Mar 7, 2025 · Mar 7, 2025 · Mar 7, 2025
diff --git a/api/core/model_runtime/entities/llm_entities.py b/api/core/model_runtime/entities/llm_entities.py
@@ -47,6 +47,7 @@ class LLMUsage(ModelUsage):
     total_price: Decimal
     currency: str
     latency: float
+    ttft: float
 
     @classmethod
     def empty_usage(cls):
@@ -63,6 +64,7 @@ def empty_usage(cls):
             total_price=Decimal("0.0"),
             currency="USD",
             latency=0.0,
+            ttft=0.0,
         )
 
     def plus(self, other: "LLMUsage") -> "LLMUsage":
@@ -88,6 +90,7 @@ def plus(self, other: "LLMUsage") -> "LLMUsage":
                 total_price=self.total_price + other.total_price,
                 currency=other.currency,
                 latency=self.latency + other.latency,
+                ttft=self.ttft,
             )
 
     def __add__(self, other: "LLMUsage") -> "LLMUsage":

diff --git a/api/core/model_runtime/model_providers/__base/ai_model.py b/api/core/model_runtime/model_providers/__base/ai_model.py
@@ -40,7 +40,8 @@ class AIModel(BaseModel):
     provider_name: str = Field(description="Provider")
     plugin_model_provider: PluginModelProviderEntity = Field(description="Plugin model provider")
     started_at: float = Field(description="Invoke start time", default=0)
-
+    last_chunked_at: float = Field(description="Last chunk time", default=0)
+    ttft: float = Field(description="Time to first token", default=0)
     # pydantic configs
     model_config = ConfigDict(protected_namespaces=())
 

diff --git a/api/core/model_runtime/model_providers/__base/large_language_model.py b/api/core/model_runtime/model_providers/__base/large_language_model.py
@@ -65,6 +65,7 @@ def invoke(
             model_parameters = {}
 
         self.started_at = time.perf_counter()
+        self.last_chunked_at = self.started_at
 
         callbacks = callbacks or []
 
@@ -234,7 +235,13 @@ def _invoke_result_generator(
         real_model = model
 
         try:
+            is_first_chunk = True
             for chunk in result:
+                if is_first_chunk:
+                    now = time.perf_counter()
+                    self.ttft = now - self.last_chunked_at
+                    is_first_chunk = False
+
                 yield chunk
 
                 self._trigger_new_chunk_callbacks(
@@ -253,6 +260,7 @@ def _invoke_result_generator(
                 assistant_message.content += chunk.delta.message.content
                 real_model = chunk.model
                 if chunk.delta.usage:
+                    chunk.delta.usage.ttft = self.ttft
                     usage = chunk.delta.usage
 
                 if chunk.system_fingerprint:
@@ -347,6 +355,7 @@ def _calc_response_usage(
             total_price=prompt_price_info.total_amount + completion_price_info.total_amount,
             currency=prompt_price_info.currency,
             latency=time.perf_counter() - self.started_at,
+            ttft=self.ttft,
         )
 
         return usage

diff --git a/api/tests/integration_tests/model_runtime/__mock/plugin_model.py b/api/tests/integration_tests/model_runtime/__mock/plugin_model.py
@@ -227,6 +227,7 @@ def mocked_chat_create_stream(
                             total_price=Decimal(0.0003),
                             currency="USD",
                             latency=0.001,
+                            ttft=0,
                         ),
                     ),
                 )
@@ -227,6 +227,7 @@ def mocked_chat_create_stream(
                                 total_price=Decimal(0.0003),
                                 currency="USD",
                                 latency=0.001,
+                                ttft=0,
                             ),
                         ),
                     )
@@ Expand Down @@