Merged

48 commits
5a5407a
feat: integrate korean book metadata and UI citations
SanghunYun95 Mar 2, 2026
8a01e1d
fix: apply coderabbit review suggestions
SanghunYun95 Mar 2, 2026
133442a
fix(backend): apply coderabbit review feedback for db and mapping scr…
SanghunYun95 Mar 2, 2026
43d1722
fix(backend): address additional coderabbit PR inline comments
SanghunYun95 Mar 2, 2026
0dd84a4
refactor(backend): use shared env parser and HTTPS for API
SanghunYun95 Mar 3, 2026
3057ad7
fix(backend): allow key rotation for all errors in book mapping
SanghunYun95 Mar 3, 2026
fc24774
feat: implement dynamic chat title and dynamic philosopher highlighting
SanghunYun95 Mar 3, 2026
cdbc817
fix: apply CodeRabbit PR review feedback
SanghunYun95 Mar 3, 2026
6c7566d
fix(pr): address CodeRabbit review feedback on backend tools and DB s…
SanghunYun95 Mar 3, 2026
78fc51a
chore: resolve merge conflicts
SanghunYun95 Mar 3, 2026
9de894d
fix(pr): address additional CodeRabbit comments
SanghunYun95 Mar 3, 2026
3d773d7
style: update welcome messages and input placeholder to be more gener…
SanghunYun95 Mar 3, 2026
4335bee
fix(pr): address additional CodeRabbit feedback for title truncation …
SanghunYun95 Mar 3, 2026
7298aac
UI: Remove redundant buttons (useful, copy, regenerate) from MessageList
SanghunYun95 Mar 3, 2026
30dd215
Merge branch 'main' into feat/book-metadata
SanghunYun95 Mar 3, 2026
ce91d6a
Refactor: apply CodeRabbit review suggestions
SanghunYun95 Mar 3, 2026
0bd1fcd
docs: rewrite README for interviewers
SanghunYun95 Mar 3, 2026
1196e30
docs, refactor: refine README and MessageList observer logic per PR c…
SanghunYun95 Mar 3, 2026
1b31b83
refactor: resolve observer unmount leak, Biome formatting, exhaustive…
SanghunYun95 Mar 3, 2026
e1ec3fc
fix: clear visibleMessages on unmount & use targeted eslint disable
SanghunYun95 Mar 3, 2026
36bd572
docs, refactor: disable philosopher filtering & update README examples
SanghunYun95 Mar 3, 2026
f13f327
refactor: apply PR refinements for mapping script and observers
SanghunYun95 Mar 3, 2026
1a9358b
Merge origin/main into feat/book-metadata (Resolve conflicts)
SanghunYun95 Mar 3, 2026
5d2841d
Fix: apply CodeRabbit feedback for React hooks and Tailwind
SanghunYun95 Mar 3, 2026
2584e3b
Feat: support multiple GEMINI_API_KEYS via comma-separated env var fo…
SanghunYun95 Mar 4, 2026
2395400
Fix: apply PR CodeRabbit round 8 feedback and add favicon
SanghunYun95 Mar 4, 2026
a0f719c
Fix: resolve conflicts and apply PR CodeRabbit round 9 feedback
SanghunYun95 Mar 4, 2026
789bdf4
Fix: apply PR CodeRabbit round 10 feedback
SanghunYun95 Mar 4, 2026
4c33094
Fix: apply PR CodeRabbit round 11 feedback
SanghunYun95 Mar 4, 2026
c9b0b91
Fix: apply PR CodeRabbit round 12 feedback
SanghunYun95 Mar 4, 2026
f24b224
fix(backend): preload models on startup and use async invokes to prev…
SanghunYun95 Mar 4, 2026
622a663
test: update mocks for refactored async llm/embedding functions
SanghunYun95 Mar 4, 2026
9eedd78
fix(pr): address lint, magic numbers, and use favicon for logo
SanghunYun95 Mar 4, 2026
4d878c2
fix(pr): resolve conflicts and add sizes prop to next/image
SanghunYun95 Mar 4, 2026
8495460
fix(backend): load models in background to prevent startup timeout on…
SanghunYun95 Mar 5, 2026
110049b
fix(backend): resolve conflict and apply PR feedback (timeouts, track…
SanghunYun95 Mar 5, 2026
105a59c
fix(backend): add graceful teardown for preload task on shutdown
SanghunYun95 Mar 5, 2026
7d918eb
feat(backend): add /ready endpoint and handle CancelledError in preload
SanghunYun95 Mar 5, 2026
382f90e
fix(backend): handle CancelledError properly in /ready readiness probe
SanghunYun95 Mar 5, 2026
1987897
fix(backend): lazy load ML models in chat routes to avoid Uvicorn sta…
SanghunYun95 Mar 5, 2026
f11491c
fix(backend): add error logging to /ready endpoint for better observa…
SanghunYun95 Mar 5, 2026
cad791b
refactor(backend): use else block for successful return in readiness …
SanghunYun95 Mar 5, 2026
e94fbe2
refactor(backend): use logger.warning in /ready, catch Exception in l…
SanghunYun95 Mar 5, 2026
359511c
Merge branch 'main' into feat/book-metadata and apply lifespan except…
SanghunYun95 Mar 5, 2026
f187cb1
fix: handle zero-chunk LLM responses, add prompt injection defense, a…
SanghunYun95 Mar 5, 2026
95be5fa
fix(backend): use HuggingFace Inference API for embeddings to resolve…
SanghunYun95 Mar 6, 2026
2c0f465
fix(backend): address CodeRabbit PR feedback for llm.py cleanup, chat…
SanghunYun95 Mar 6, 2026
bfe167c
fix: resolve merge conflicts and restore PR feedback fixes
SanghunYun95 Mar 6, 2026
4 changes: 3 additions & 1 deletion backend/app/api/routes/chat.py
@@ -128,17 +128,19 @@ async def generate_chat_events(request: Request, query: str, history: List[Histo

     try:
         chunk_count = 0
+        disconnected = False
         async for chunk in get_response_stream_async(context=combined_context, query=english_query, history=formatted_history):
             # If client disconnects, stop generating
             if await request.is_disconnected():
+                disconnected = True
                 break

             chunk_count += 1
             # Clean up chunk to avoid SSE formatting issues with newlines
             chunk_clean = chunk.replace("\n", "\\n")
             yield {"event": "content", "data": chunk_clean}

-        if chunk_count == 0:
+        if not disconnected and chunk_count == 0:
             logger.warning("LLM returned 0 chunks. Sending a fallback message.")
             yield {"event": "content", "data": "철학자는 난색을 표하며 서적을 뒤적거립니다. 대신 철학자가 답변을 해줄 만한 다른 질문은 없을까요?"}
coderabbitai[bot] marked this conversation as resolved.
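The new `disconnected` flag ensures the fallback message fires only when the LLM genuinely produced zero chunks, not when the client left mid-stream. A minimal runnable sketch of the pattern (the stream and disconnect check here are stand-ins, not the project's actual handlers):

```python
import asyncio

async def stream_events(chunks, is_disconnected):
    """Yield SSE-style events; emit a fallback only for a genuinely empty stream."""
    chunk_count = 0
    disconnected = False
    async for chunk in chunks:
        # Stop generating if the client has gone away
        if await is_disconnected():
            disconnected = True
            break
        chunk_count += 1
        yield {"event": "content", "data": chunk.replace("\n", "\\n")}
    # Without the flag, a client disconnect on the first chunk would
    # wrongly trigger the "LLM returned nothing" fallback here.
    if not disconnected and chunk_count == 0:
        yield {"event": "content", "data": "fallback"}

async def demo():
    async def empty():
        return
        yield  # bare return before yield: an async generator that yields nothing
    async def connected():
        return False
    return [e async for e in stream_events(empty(), connected)]

print(asyncio.run(demo()))  # → [{'event': 'content', 'data': 'fallback'}]
```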
1 change: 1 addition & 0 deletions backend/app/core/config.py
@@ -5,6 +5,7 @@ class Settings(BaseSettings):

     # API Keys
     GEMINI_API_KEY: str = ""
     ALADIN_API_KEY: str = ""
+    HUGGINGFACEHUB_API_TOKEN: str = ""

     # Supabase Settings
     SUPABASE_URL: str = ""
2 changes: 1 addition & 1 deletion backend/app/main.py
@@ -49,7 +49,7 @@ def _on_preload_done(task: asyncio.Task):

         await asyncio.wait_for(asyncio.shield(preload_task), timeout=3.0)
     except asyncio.TimeoutError:
         logger.warning("Preload task did not finish before shutdown.")
-    except Exception as e:
+    except Exception:
         logger.exception("Exception occurred while waiting for preload task during shutdown.")

 app = FastAPI(
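Wrapping the preload task in `asyncio.shield` inside `wait_for` bounds how long shutdown waits without cancelling the task the moment the timeout fires. A standalone sketch of the pattern with a stand-in slow task (names illustrative, not the app's real lifespan code):

```python
import asyncio

async def preload():
    await asyncio.sleep(10)  # stands in for slow model loading
    return "loaded"

async def shutdown_wait(task, timeout=0.1):
    try:
        # shield() keeps the wrapped task running even when wait_for times out;
        # without it, the timeout would cancel the preload mid-flight.
        await asyncio.wait_for(asyncio.shield(task), timeout=timeout)
    except asyncio.TimeoutError:
        return "preload did not finish before shutdown"
    return "preload finished"

async def main():
    task = asyncio.create_task(preload())
    msg = await shutdown_wait(task)
    # In this sketch, clean up the still-running task explicitly
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        pass
    return msg

print(asyncio.run(main()))  # → preload did not finish before shutdown
```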
17 changes: 10 additions & 7 deletions backend/app/services/embedding.py
@@ -1,6 +1,7 @@

 import threading
 import logging
-from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_huggingface import HuggingFaceEndpointEmbeddings
+from app.core.config import settings

 logger = logging.getLogger(__name__)

@@ -18,13 +19,15 @@ def embeddings(self):

         if self._embeddings is None:
             with self._lock:
                 if self._embeddings is None:
-                    logger.info("Loading local embedding model: %s (HuggingFace)...", MODEL_NAME)
-                    self._embeddings = HuggingFaceEmbeddings(
-                        model_name=MODEL_NAME,
-                        model_kwargs={'device': 'cpu'},
-                        encode_kwargs={'normalize_embeddings': True}
+                    logger.info("Using HuggingFace Inference API for embedding model: %s", MODEL_NAME)
+                    if not settings.HUGGINGFACEHUB_API_TOKEN:
+                        logger.warning("HUGGINGFACEHUB_API_TOKEN is not set. The Inference API might fail if heavily rate-limited.")
+                    self._embeddings = HuggingFaceEndpointEmbeddings(
+                        model=MODEL_NAME,
+                        task="feature-extraction",
+                        huggingfacehub_api_token=settings.HUGGINGFACEHUB_API_TOKEN
                     )
-                    logger.info("Local embedding model loaded successfully.")
+                    logger.info("HuggingFace Inference API configured successfully.")
         return self._embeddings

     def _validate_embedding_dimension(self, embedding: list[float]) -> None:
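The surrounding property uses double-checked locking (`if None`, take the lock, `if None` again) so the embeddings client is built exactly once even under concurrent first access. A generic sketch of that pattern with a hypothetical factory, independent of the HuggingFace wiring above:

```python
import threading

class LazyClient:
    """Thread-safe lazy singleton holder using double-checked locking."""

    def __init__(self, factory):
        self._factory = factory
        self._client = None
        self._lock = threading.Lock()

    @property
    def client(self):
        # First check skips the lock on the hot path; the second check
        # prevents double construction when two threads race past the first.
        if self._client is None:
            with self._lock:
                if self._client is None:
                    self._client = self._factory()
        return self._client

calls = []
def build_client():
    calls.append(1)       # count how many times the factory actually runs
    return object()

holder = LazyClient(build_client)
a, b = holder.client, holder.client
print(a is b, len(calls))  # → True 1
```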
23 changes: 14 additions & 9 deletions backend/app/services/llm.py
@@ -124,15 +124,20 @@ async def get_response_stream_async(context: str, query: str, history: str = "")

     prompt = get_rag_prompt()
     chain = prompt | get_llm() | StrOutputParser()
     generator = chain.astream({"context": context, "chat_history": history, "query": query})
-    while True:
-        try:
-            chunk = await asyncio.wait_for(generator.__anext__(), timeout=30.0)
-            yield chunk
-        except StopAsyncIteration:
-            break
-        except asyncio.TimeoutError:
-            print("LLM stream chunk timed out after 30 seconds.")
-            raise
+    try:
+        while True:
+            try:
+                chunk = await asyncio.wait_for(generator.__anext__(), timeout=30.0)
+                yield chunk
+            except StopAsyncIteration:
+                break
+            except asyncio.TimeoutError:
+                import logging
+                logger = logging.getLogger(__name__)
+                logger.warning(f"LLM stream chunk timed out after 30 seconds. Query: {query}")
+                raise
+    finally:
+        await generator.aclose()

CodeRabbit review comment on the new `logger.warning` line:

⚠️ Potential issue | 🟠 Major

Do not log the raw user query in the timeout log.

Line 137 records user input verbatim, so sensitive information could end up in production logs. Please log only non-identifying information instead of the original text.

🔧 Suggested fix
-        logger.warning(f"LLM stream chunk timed out after 30 seconds. Query: {query}")
+        logger.warning("LLM stream chunk timed out after 30 seconds")

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
logger.warning(f"LLM stream chunk timed out after 30 seconds. Query: {query}")
logger.warning("LLM stream chunk timed out after 30 seconds")

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/services/llm.py` at line 137, The warning currently logs the raw
user input via logger.warning(f"... Query: {query}"); change it to avoid storing
PII by logging only non-identifying info (e.g., a short hash or metadata).
Replace the f-string that references query with a redacted identifier: compute a
short SHA-256 (or similar) of query (using
hashlib.sha256(query.encode('utf-8')).hexdigest()[:8]) or log query
length/placeholder, and include that hash/id in the logger.warning call instead
of the raw query; ensure you add the hashlib import if you choose hashing and
keep references to the same logger.warning and query variable names.
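Beyond dropping the query entirely, the reviewer's hashing idea keeps a way to correlate repeated timeouts without storing user text. A sketch along those lines (`redacted_query_id` is an illustrative helper, not code from the PR):

```python
import hashlib

def redacted_query_id(query: str) -> str:
    """Return a short, non-reversible identifier suitable for log correlation."""
    return hashlib.sha256(query.encode("utf-8")).hexdigest()[:8]

# Log only the digest and the length, never the raw text
query = "example user question"
print(f"LLM stream chunk timed out after 30 seconds. "
      f"query_id={redacted_query_id(query)} len={len(query)}")
```

The eight-character digest is enough to spot the same query timing out repeatedly while remaining practically non-reversible for free-form text.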

title_prompt = PromptTemplate.from_template(
"""주어진 질문을 기반으로 철학적인 대화방 제목을 15자 이내로 지어줘.
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -10,5 +10,6 @@ pydantic>=2.7.0

 pydantic-settings
 python-dotenv
 langchain-community==0.4.1
+langchain-huggingface>=0.1.0
 sentence-transformers>=2.2.0,<3.0.0
 slowapi>=0.1.9