diff --git a/litellm/__init__.py b/litellm/__init__.py index 0bb9ac27232..69f80123a08 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -1628,6 +1628,14 @@ def __getattr__(name: str) -> Any: return _lazy_import_llm_configs(name) + # Lazy load encoding from main.py to avoid heavy tiktoken import + if name == "encoding": + from .main import encoding as _encoding + # Cache it in the module's __dict__ for subsequent accesses + import sys + sys.modules[__name__].__dict__["encoding"] = _encoding + return _encoding + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/litellm/main.py b/litellm/main.py index 99788291db2..b46208cc432 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -238,7 +238,6 @@ all_litellm_params, ) -encoding = tiktoken.get_encoding("cl100k_base") from litellm.types.utils import ModelResponseStream from litellm.utils import ( Choices, @@ -1511,7 +1510,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, # pass AsyncOpenAI, OpenAI client custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, ) @@ -1734,7 +1733,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, provider_config=provider_config, ) @@ -1813,7 +1812,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, headers=headers, @@ -1861,7 +1860,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, # pass AsyncOpenAI, OpenAI client custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, ) except Exception as e: @@ -1991,7 +1990,7 @@ def completion( # 
type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, provider_config=provider_config, ) @@ -2021,7 +2020,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, provider_config=provider_config, ) @@ -2052,7 +2051,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, provider_config=provider_config, ) @@ -2082,7 +2081,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, provider_config=provider_config, ) @@ -2134,7 +2133,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider=custom_llm_provider, timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements client=client, @@ -2162,7 +2161,7 @@ def completion( # type: ignore # noqa: PLR0915 shared_session=shared_session, client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, api_base=api_base, stream=stream, @@ -2202,7 +2201,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, ) elif custom_llm_provider == "cometapi": @@ -2236,7 +2235,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + 
encoding=_get_encoding(), stream=stream, provider_config=provider_config, ) @@ -2321,7 +2320,7 @@ def completion( # type: ignore # noqa: PLR0915 api_base=api_base, custom_llm_provider=custom_llm_provider, model_response=model_response, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, optional_params=optional_params, timeout=timeout, @@ -2389,7 +2388,7 @@ def completion( # type: ignore # noqa: PLR0915 api_base=api_base, custom_llm_provider=custom_llm_provider, model_response=model_response, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, optional_params=optional_params, timeout=timeout, @@ -2434,7 +2433,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, # for calculating input/output tokens + encoding=_get_encoding(), # for calculating input/output tokens api_key=replicate_key, logging_obj=logging, custom_prompt_dict=custom_prompt_dict, @@ -2499,7 +2498,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="anthropic_text", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements ) @@ -2545,7 +2544,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, # for calculating input/output tokens + encoding=_get_encoding(), # for calculating input/output tokens api_key=api_key, logging_obj=logging, headers=headers, @@ -2585,7 +2584,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), api_key=nlp_cloud_key, logging_obj=logging, ) @@ -2633,7 +2632,7 @@ def completion( # type: ignore # noqa: PLR0915 
optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), default_max_tokens_to_sample=litellm.max_tokens, api_key=aleph_alpha_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements @@ -2701,7 +2700,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="cohere_chat", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=cohere_key, provider_config=provider_config, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements @@ -2730,7 +2729,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), api_key=maritalk_key, logging_obj=logging, custom_llm_provider="maritalk", @@ -2760,7 +2759,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, timeout=timeout, @@ -2790,7 +2789,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, ) elif custom_llm_provider == "oci": @@ -2808,7 +2807,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, ) elif custom_llm_provider == "compactifai": @@ -2833,7 +2832,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, 
provider_config=provider_config, ) @@ -2849,7 +2848,7 @@ def completion( # type: ignore # noqa: PLR0915 litellm_params=litellm_params, api_key=None, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, ) if "stream" in optional_params and optional_params["stream"] is True: @@ -2893,7 +2892,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="databricks", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements client=client, @@ -2932,7 +2931,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, provider_config=provider_config, ) @@ -2994,7 +2993,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="openrouter", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements client=client, @@ -3057,7 +3056,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="vercel_ai_gateway", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements client=client, @@ -3115,7 +3114,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=new_params, litellm_params=litellm_params, # type: ignore logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), vertex_location=vertex_ai_location, vertex_project=vertex_ai_project, vertex_credentials=vertex_credentials, @@ -3164,7 +3163,7 @@ def completion( # type: ignore # noqa: 
PLR0915 optional_params=new_params, litellm_params=litellm_params, # type: ignore logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), api_base=api_base, vertex_location=vertex_ai_location, vertex_project=vertex_ai_project, @@ -3185,7 +3184,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=new_params, litellm_params=litellm_params, # type: ignore logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), vertex_location=vertex_ai_location, vertex_project=vertex_ai_project, vertex_credentials=vertex_credentials, @@ -3208,7 +3207,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=new_params, litellm_params=litellm_params, # type: ignore logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), api_base=api_base, vertex_location=vertex_ai_location, vertex_project=vertex_ai_project, @@ -3230,7 +3229,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=new_params, litellm_params=litellm_params, # type: ignore logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), api_base=api_base, vertex_location=vertex_ai_location, vertex_project=vertex_ai_project, @@ -3262,7 +3261,7 @@ def completion( # type: ignore # noqa: PLR0915 model_response=model_response, optional_params=new_params, litellm_params=litellm_params, # type: ignore - encoding=encoding, + encoding=_get_encoding(), api_key=None, api_base=api_base, logging_obj=logging, @@ -3282,7 +3281,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=new_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), vertex_location=vertex_ai_location, vertex_project=vertex_ai_project, vertex_credentials=vertex_credentials, @@ -3339,7 +3338,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, acompletion=acompletion, 
api_base=api_base, @@ -3379,7 +3378,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, acompletion=acompletion, api_base=api_base, @@ -3409,7 +3408,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="sagemaker_chat", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements client=client, @@ -3429,7 +3428,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_prompt_dict=custom_prompt_dict, hf_model_name=hf_model_name, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, acompletion=acompletion, ) @@ -3473,7 +3472,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, # type: ignore logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, extra_headers=headers, # Use merged headers instead of original extra_headers timeout=timeout, @@ -3496,7 +3495,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="bedrock", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements client=client, @@ -3514,7 +3513,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="bedrock", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, client=client, @@ -3536,7 +3535,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore custom_prompt_dict=custom_prompt_dict, client=client, # pass AsyncOpenAI, OpenAI client - 
encoding=encoding, + encoding=_get_encoding(), custom_llm_provider="watsonx", ) elif custom_llm_provider == "watsonx_text": @@ -3598,7 +3597,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="watsonx_text", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements client=client, @@ -3614,7 +3613,7 @@ def completion( # type: ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, ) @@ -3655,7 +3654,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="ollama", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements client=client, @@ -3691,7 +3690,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="ollama_chat", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements client=client, @@ -3712,7 +3711,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider=custom_llm_provider, timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, ) @@ -3745,7 +3744,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="cloudflare", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements ) @@ -3764,7 +3763,7 @@ def completion( # type: 
ignore # noqa: PLR0915 optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, client=client, ) @@ -3799,7 +3798,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, ) @@ -3827,7 +3826,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider="gradient_ai", timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, ) @@ -3854,7 +3853,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, provider_config=bytez_transformation, ) @@ -3882,7 +3881,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, provider_config=lemonade_transformation, ) @@ -3918,7 +3917,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore client=client, custom_llm_provider=custom_llm_provider, - encoding=encoding, + encoding=_get_encoding(), stream=stream, provider_config=ovhcloud_transformation, ) @@ -4024,7 +4023,7 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore custom_prompt_dict=custom_prompt_dict, client=client, # pass AsyncOpenAI, OpenAI client - encoding=encoding, + encoding=_get_encoding(), ) if stream is True: return CustomStreamWrapper( @@ -4061,7 +4060,7 @@ def completion( # type: ignore # noqa: PLR0915 custom_llm_provider=custom_llm_provider, timeout=timeout, headers=headers, - encoding=encoding, + encoding=_get_encoding(), api_key=api_key, logging_obj=logging, client=client, @@ -4629,7 +4628,7 @@ def 
embedding( # noqa: PLR0915 response = huggingface_embed.embedding( model=model, input=input, - encoding=encoding, # type: ignore + encoding=_get_encoding(), # type: ignore api_key=api_key, api_base=api_base, logging_obj=logging, @@ -4647,7 +4646,7 @@ def embedding( # noqa: PLR0915 response = bedrock_embedding.embeddings( model=model, input=transformed_input, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, optional_params=optional_params, model_response=EmbeddingResponse(), @@ -4687,7 +4686,7 @@ def embedding( # noqa: PLR0915 response = google_batch_embeddings.batch_embeddings( # type: ignore model=model, input=input, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, optional_params=optional_params, model_response=EmbeddingResponse(), @@ -4742,7 +4741,7 @@ def embedding( # noqa: PLR0915 response = vertex_multimodal_embedding.multimodal_embedding( model=model, input=input, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, optional_params=optional_params, litellm_params=litellm_params_dict, @@ -4760,7 +4759,7 @@ def embedding( # noqa: PLR0915 response = vertex_embedding.embedding( model=model, input=input, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, optional_params=optional_params, model_response=EmbeddingResponse(), @@ -4779,7 +4778,7 @@ def embedding( # noqa: PLR0915 response = oobabooga.embedding( model=model, input=input, - encoding=encoding, + encoding=_get_encoding(), api_base=api_base, logging_obj=logging, optional_params=optional_params, @@ -4811,7 +4810,7 @@ def embedding( # noqa: PLR0915 api_base=api_base, model=model, prompts=input, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, optional_params=optional_params, model_response=EmbeddingResponse(), @@ -4820,7 +4819,7 @@ def embedding( # noqa: PLR0915 response = sagemaker_llm.embedding( model=model, input=input, - encoding=encoding, + encoding=_get_encoding(), logging_obj=logging, 
optional_params=optional_params, model_response=EmbeddingResponse(), @@ -6891,3 +6890,26 @@ def stream_chunk_builder( # noqa: PLR0915 llm_provider="", model="", ) + + +# Cached tiktoken encoding; populated on first use so that importing this +# module does not pay the slow tiktoken load cost at import time. +_encoding_cache: Optional[Any] = None + + +def _get_encoding(): + """Return the shared cl100k_base encoding, creating and caching it on first use.""" + global _encoding_cache + if _encoding_cache is None: + _encoding_cache = tiktoken.get_encoding("cl100k_base") + # Also publish it as a module attribute so `litellm.main.encoding` + # resolves directly without going through __getattr__ again. + globals()["encoding"] = _encoding_cache + return _encoding_cache + + +def __getattr__(name: str) -> Any: + """Lazy attribute hook (PEP 562): defers the heavy tiktoken load until `encoding` is first accessed.""" + if name == "encoding": + return _get_encoding() + raise AttributeError(f"module {__name__!r} has no attribute {name!r}")