Ошибка OSError: Unable to load vocabulary from file. Please check that the provided vocabulary is accessible and not corrupted. Помогите исправить

Question

# Notebook cell: install the pinned dependencies used by the snippet below.
# The trailing backslashes join the lines into a single pip command; without
# them only "torch sentencepiece" is passed to pip and the remaining lines are
# parsed as Python code.
!pip install torch sentencepiece \
    accelerate==0.18.0 \
    bitsandbytes==0.37.2 \
    git+https://github.com/huggingface/transformers.git@15641892985b1d77acc74c9065c332cd7c3f7d7f \
    git+https://github.com/huggingface/peft.git@c22a57420cc539b547beb7e40cd0712c9f56910a

import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "IlyaGusev/saiga_7b_lora"  # id passed to the tokenizer, PEFT config and PEFT model loaders below

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)  # the OSError in the traceback below is raised by this call
config = PeftConfig.from_pretrained(MODEL_NAME)  # provides base_model_name_or_path for the base model load
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)
model = PeftModel.from_pretrained(  # rebinds `model` to the PEFT wrapper loaded from MODEL_NAME
    model,
    MODEL_NAME,
    torch_dtype=torch.float16
)
model.eval()
# NOTE(review): per the traceback, sentencepiece reports tokenizer.model as
# "Not found" inside the Hugging Face cache, and that cache path contains a
# non-ASCII user directory name. Possibly a path-encoding problem on Windows
# rather than a missing file -- TODO confirm, e.g. by retrying the tokenizer
# load with a cache directory on an ASCII-only path.

После запуска этого кода вызов AutoTokenizer.from_pretrained выдаёт ошибку:

    OSError                                   Traceback (most recent call last)
    File ~\AppData\Roaming\Python\Python311\site-packages\transformers\tokenization_utils_base.py:1965, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, *init_inputs, **kwargs)
       1964 try:
    -> 1965     tokenizer = cls(*init_inputs, **init_kwargs)
       1966 except OSError:
    
   

     File ~\AppData\Roaming\Python\Python311\site-packages\transformers\models\llama\tokenization_llama.py:96, in LlamaTokenizer.__init__(self, vocab_file, unk_token, bos_token, eos_token, pad_token, sp_model_kwargs, add_bos_token, add_eos_token, clean_up_tokenization_spaces, **kwargs)
             95 self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        ---> 96 self.sp_model.Load(vocab_file)
        
        File ~\AppData\Roaming\Python\Python311\site-packages\sentencepiece\__init__.py:961, in SentencePieceProcessor.Load(self, model_file, model_proto)
            960   return self.LoadFromSerializedProto(model_proto)
        --> 961 return self.LoadFromFile(model_file)
        
        File ~\AppData\Roaming\Python\Python311\site-packages\sentencepiece\__init__.py:316, in SentencePieceProcessor.LoadFromFile(self, arg)
            315 def LoadFromFile(self, arg):
        --> 316     return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
        
        OSError: Not found: "C:\Users\Нейронка/.cache\huggingface\hub\models--IlyaGusev--saiga_7b_lora\snapshots\ea333322229f2a5d16dd06b5cc1b882dbc06124f\tokenizer.model": No such file or directory Error #2
        
        During handling of the above exception, another exception occurred:
        
        OSError                                   Traceback (most recent call last)
        Cell In[39], line 7
              3 from transformers import AutoModelForCausalLM, AutoTokenizer
              5 MODEL_NAME = "IlyaGusev/saiga_7b_lora"
        ----> 7 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
              8 config = PeftConfig.from_pretrained(MODEL_NAME)
              9 model = AutoModelForCausalLM.from_pretrained(
             10     config.base_model_name_or_path,
             11     load_in_8bit=True,
             12     device_map="auto",
             13     torch_dtype=torch.float16
             14 )
        
        File ~\AppData\Roaming\Python\Python311\site-packages\transformers\models\auto\tokenization_auto.py:694, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
            690     if tokenizer_class is None:
            691         raise ValueError(
            692             f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
            693         )
        --> 694     return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            696 # Otherwise we have to be creative.
            697 # if model is an encoder decoder, the encoder tokenizer class is used by default
            698 if isinstance(config, EncoderDecoderConfig):
        
        File ~\AppData\Roaming\Python\Python311\site-packages\transformers\tokenization_utils_base.py:1811, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
           1808     else:
           1809         logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
        -> 1811 return cls._from_pretrained(
           1812     resolved_vocab_files,
           1813     pretrained_model_name_or_path,
           1814     init_configuration,
           1815     *init_inputs,
           1816     use_auth_token=use_auth_token,
           1817     cache_dir=cache_dir,
           1818     local_files_only=local_files_only,
           1819     _commit_hash=commit_hash,
           1820     **kwargs,
           1821 )
        
        File ~\AppData\Roaming\Python\Python311\site-packages\transformers\tokenization_utils_base.py:1967, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, *init_inputs, **kwargs)
           1965     tokenizer = cls(*init_inputs, **init_kwargs)
           1966 except OSError:
        -> 1967     raise OSError(
           1968         "Unable to load vocabulary from file. "
           1969         "Please check that the provided vocabulary is accessible and not corrupted."
           1970     )
           1972 # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
           1973 # Removed: Now done at the base class level
           1974 # tokenizer.init_inputs = init_inputs
           1975 # tokenizer.init_kwargs = init_kwargs
           1976 
           1977 # If there is a complementary special token map, load it
           1978 special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
        
        OSError: Unable to load vocabulary from file. Please check that the provided vocabulary is accessible and not corrupted.

БЛОГ НА HUSL

Ошибка OSError: Unable to load vocabulary from file. Please check that the provided vocabulary is accessible and not corrupted. Помогите исправить

Ответы (0 шт):