
Can't load tokenizer for 'gpt2' #752

@monkeycc

Description

from scrapegraphai.graphs import SmartScraperGraph


graph_config2 = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",
        "base_url": "http://localhost:11434", 
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "base_url": "http://localhost:11434", 
    },
    "verbose": True,
}


smart_scraper_graph3 = SmartScraperGraph(
    prompt="Return the names, author names, ratings, and book links of all books on this page",
    source="https://book.douban.com/top250",
    config=graph_config2
)

result3 = smart_scraper_graph3.run()
print(result3)
Running the script fails with the following traceback:

    result3 = smart_scraper_graph3.run()
              ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\smart_scraper_graph.py", line 183, in run
    self.final_state, self.execution_info = self.graph.execute(inputs)
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 281, in execute
    return self._execute_standard(initial_state)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 197, in _execute_standard
    raise e
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 181, in _execute_standard
    result = current_node.execute(state)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\nodes\parse_node.py", line 83, in execute
    chunks = split_text_into_chunks(text=docs_transformed.page_content,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\split_text_into_chunks.py", line 28, in split_text_into_chunks
    chunks = chunk(text=text,
             ^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\semchunk\semchunk.py", line 129, in chunk
    if token_counter(split) > chunk_size:
       ^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\split_text_into_chunks.py", line 24, in count_tokens
    return num_tokens_calculus(text, model)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\tokenizer.py", line 30, in num_tokens_calculus
    num_tokens = num_tokens_fn(string, llm_model)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\tokenizers\tokenizer_ollama.py", line 26, in num_tokens_ollama
    tokens = llm_model.get_num_tokens(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 365, in get_num_tokens
    return len(self.get_token_ids(text))
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 352, in get_token_ids
    return _get_token_ids_default_method(text)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 76, in _get_token_ids_default_method
    tokenizer = get_tokenizer()
                ^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 70, in get_tokenizer
    return GPT2TokenizerFast.from_pretrained("gpt2")
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\transformers\tokenization_utils_base.py", line 2192, in from_pretrained
    raise EnvironmentError(
OSError: Can't load tokenizer for 'gpt2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'gpt2' is the correct path to a directory containing all relevant files for a GPT2TokenizerFast tokenizer.
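
The last few frames show the root cause: for an Ollama model, langchain_core has no model-specific tokenizer, so _get_token_ids_default_method falls back to GPT2TokenizerFast.from_pretrained("gpt2"), which downloads the tokenizer files from huggingface.co on first use. If that host is unreachable (offline machine, firewall, proxy), or if a local directory named gpt2 shadows the model id, from_pretrained raises the OSError above. A minimal workaround sketch, assuming the machine can reach huggingface.co at least once: pre-download the tokenizer so later runs resolve it from the local cache.

from transformers import GPT2TokenizerFast

# One-time download; transformers caches the files (by default under
# ~/.cache/huggingface), so later runs resolve "gpt2" from disk.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
print(tokenizer.tokenize("sanity check"))

As the error message itself suggests, also confirm there is no directory literally named gpt2 in the working directory, since from_pretrained would treat it as a local model path.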
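If huggingface.co is blocked outright but a mirror is reachable, huggingface_hub honors the HF_ENDPOINT environment variable. A hedged alternative sketch; the mirror URL here is an example and must be an endpoint your network actually allows:

import os

# Must be set before huggingface_hub/transformers are first imported,
# since the endpoint is read at import time. hf-mirror.com is a
# community-run mirror; substitute one your network permits.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from scrapegraphai.graphs import SmartScraperGraph  # import only after setting the variable

On a fully offline box, another option is to copy the populated ~/.cache/huggingface directory from a connected machine and point HF_HOME at it.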
