3333 validate_hf_hub_args ,
3434)
3535from .utils import tqdm as hf_tqdm
36+ from .utils ._runtime import is_xet_available
3637
3738
3839if TYPE_CHECKING :
@@ -353,7 +354,7 @@ def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
353354
354355
@validate_hf_hub_args
def _upload_files(
    *,
    additions: List[CommitOperationAdd],
    repo_type: str,
    repo_id: str,
    headers: Dict[str, str],
    endpoint: Optional[str] = None,
    num_threads: int = 5,
    revision: Optional[str] = None,
    create_pr: Optional[bool] = None,
):
    """
    Negotiate a transfer protocol per batch of files (LFS vs. Xet) and upload them.

    Files are sent to the LFS batch endpoint in chunks of `UPLOAD_BATCH_MAX_NUM_FILES`
    to avoid reaching the payload limit. For each chunk, the "xet" transfer is offered
    only when the Xet runtime is available and no file in the chunk is a buffered IO
    object (not supported by Xet Storage). Chunks the server routes to "xet" are
    uploaded with `_upload_xet_files`; all other chunks go through `_upload_lfs_files`
    using the server-provided batch actions.

    Args:
        additions (`List[CommitOperationAdd]`):
            The files to be uploaded.
        repo_type (`str`):
            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated by a `/`.
        headers (`Dict[str, str]`):
            Headers to use for the requests, including authorization headers and user agent.
        endpoint (`str`, *optional*):
            The endpoint to use for the request. Defaults to `constants.ENDPOINT`.
        num_threads (`int`, *optional*):
            Number of concurrent threads for LFS uploads. Defaults to 5.
        revision (`str`, *optional*):
            The git revision to upload to.
        create_pr (`bool`, *optional*):
            Whether the upload targets a pull request (forwarded to the Xet uploader).

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If the LFS batch API returned per-file errors.
    """
    xet_additions: List[CommitOperationAdd] = []
    lfs_actions: List[Dict] = []
    lfs_oid2addop: Dict[str, CommitOperationAdd] = {}

    for chunk in chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES):
        chunk_list = list(chunk)

        transfers: List[str] = ["basic", "multipart"]
        has_buffered_io_data = any(isinstance(op.path_or_fileobj, io.BufferedIOBase) for op in chunk_list)
        if is_xet_available():
            if not has_buffered_io_data:
                transfers.append("xet")
            else:
                logger.warning(
                    "Uploading files as a binary IO buffer is not supported by Xet Storage. "
                    "Falling back to HTTP upload."
                )

        actions_chunk, errors_chunk, chosen_transfer = post_lfs_batch_info(
            upload_infos=[op.upload_info for op in chunk_list],
            repo_id=repo_id,
            repo_type=repo_type,
            revision=revision,
            endpoint=endpoint,
            headers=headers,
            token=None,  # already passed in 'headers'
            transfers=transfers,
        )
        # Fail fast on the first chunk that reports errors.
        if errors_chunk:
            message = "\n".join(
                [
                    # Fix: close the backtick around the error message (was left unbalanced).
                    f"Encountered error for file with OID {err.get('oid')}: `{err.get('error', {}).get('message')}`"
                    for err in errors_chunk
                ]
            )
            raise ValueError(f"LFS batch API returned errors:\n{message}")

        # The server may answer "xet" even when we did not offer it (e.g. when
        # uploading from a BytesIO buffer) => only honor "xet" if we offered it;
        # otherwise fall back to LFS for this chunk.
        if chosen_transfer == "xet" and "xet" in transfers:
            xet_additions.extend(chunk_list)
        else:
            lfs_actions.extend(actions_chunk)
            for op in chunk_list:
                lfs_oid2addop[op.upload_info.sha256.hex()] = op

    if len(lfs_actions) > 0:
        _upload_lfs_files(
            actions=lfs_actions,
            oid2addop=lfs_oid2addop,
            headers=headers,
            endpoint=endpoint,
            num_threads=num_threads,
        )

    if len(xet_additions) > 0:
        _upload_xet_files(
            additions=xet_additions,
            repo_type=repo_type,
            repo_id=repo_id,
            headers=headers,
            endpoint=endpoint,
            revision=revision,
            create_pr=create_pr,
        )
437+
438+ @validate_hf_hub_args
439+ def _upload_lfs_files (
440+ * ,
441+ actions : List [Dict ],
442+ oid2addop : Dict [str , CommitOperationAdd ],
443+ headers : Dict [str , str ],
444+ endpoint : Optional [str ] = None ,
445+ num_threads : int = 5 ,
365446):
366447 """
367448 Uploads the content of `additions` to the Hub using the large file storage protocol.
@@ -370,9 +451,21 @@ def _upload_lfs_files(
370451 - LFS Batch API: https://github.com/git-lfs/git-lfs/blob/main/docs/api/batch.md
371452
372453 Args:
373- additions (`List` of `CommitOperationAdd`):
374- The files to be uploaded
375- repo_type (`str`):
454+ actions (`List[Dict]`):
455+ LFS batch actions returned by the server.
456+ oid2addop (`Dict[str, CommitOperationAdd]`):
457+ A dictionary mapping the OID of the file to the corresponding `CommitOperationAdd` object.
458+ headers (`Dict[str, str]`):
459+ Headers to use for the request, including authorization headers and user agent.
460+ endpoint (`str`, *optional*):
461+ The endpoint to use for the request. Defaults to `constants.ENDPOINT`.
462+ num_threads (`int`, *optional*):
463+ The number of concurrent threads to use when uploading. Defaults to 5.
464+
465+ Raises:
466+ [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
467+ If an upload failed for any reason
468+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
376469 Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
377470 repo_id (`str`):
378471 A namespace (user or an organization) and a repo name separated
@@ -392,50 +485,17 @@ def _upload_lfs_files(
392485 [`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
393486 If the LFS batch endpoint returned an HTTP error.
394487 """
395- # Step 1: retrieve upload instructions from the LFS batch endpoint.
396- # Upload instructions are retrieved by chunk of 256 files to avoid reaching
397- # the payload limit.
398- batch_actions : List [Dict ] = []
399- for chunk in chunk_iterable (additions , chunk_size = UPLOAD_BATCH_MAX_NUM_FILES ):
400- batch_actions_chunk , batch_errors_chunk = post_lfs_batch_info (
401- upload_infos = [op .upload_info for op in chunk ],
402- repo_id = repo_id ,
403- repo_type = repo_type ,
404- revision = revision ,
405- endpoint = endpoint ,
406- headers = headers ,
407- token = None , # already passed in 'headers'
408- )
409-
410- # If at least 1 error, we do not retrieve information for other chunks
411- if batch_errors_chunk :
412- message = "\n " .join (
413- [
414- f"Encountered error for file with OID { err .get ('oid' )} : `{ err .get ('error' , {}).get ('message' )} "
415- for err in batch_errors_chunk
416- ]
417- )
418- raise ValueError (f"LFS batch endpoint returned errors:\n { message } " )
419-
420- batch_actions += batch_actions_chunk
421- oid2addop = {add_op .upload_info .sha256 .hex (): add_op for add_op in additions }
422-
423- # Step 2: ignore files that have already been uploaded
488+ # Filter out files already present upstream
424489 filtered_actions = []
425- for action in batch_actions :
490+ for action in actions :
426491 if action .get ("actions" ) is None :
427492 logger .debug (
428- f"Content of file { oid2addop [action ['oid' ]].path_in_repo } is already"
429- " present upstream - skipping upload."
493+ f"Content of file { oid2addop [action ['oid' ]].path_in_repo } is already present upstream - skipping upload."
430494 )
431495 else :
432496 filtered_actions .append (action )
433497
434- if len (filtered_actions ) == 0 :
435- logger .debug ("No LFS files to upload." )
436- return
437-
438- # Step 3: upload files concurrently according to these instructions
498+ # Upload according to server-provided actions
439499 def _wrapped_lfs_upload (batch_action ) -> None :
440500 try :
441501 operation = oid2addop [batch_action ["oid" ]]
@@ -576,30 +636,30 @@ def token_refresher() -> Tuple[str, int]:
576636 progress , progress_callback = None , None
577637
578638 try :
579- for i , chunk in enumerate ( chunk_iterable ( additions , chunk_size = UPLOAD_BATCH_MAX_NUM_FILES )):
580- _chunk = [op for op in chunk ]
581-
582- bytes_ops = [ op for op in _chunk if isinstance ( op . path_or_fileobj , bytes )]
583- paths_ops = [op for op in _chunk if isinstance ( op . path_or_fileobj , ( str , Path )) ]
584-
585- if len ( paths_ops ) > 0 :
586- upload_files (
587- [ str ( op . path_or_fileobj ) for op in paths_ops ] ,
588- xet_endpoint ,
589- access_token_info ,
590- token_refresher ,
591- progress_callback ,
592- repo_type ,
593- )
594- if len ( bytes_ops ) > 0 :
595- upload_bytes (
596- [ op . path_or_fileobj for op in bytes_ops ] ,
597- xet_endpoint ,
598- access_token_info ,
599- token_refresher ,
600- progress_callback ,
601- repo_type ,
602- )
639+ all_bytes_ops = [ op for op in additions if isinstance ( op . path_or_fileobj , bytes )]
640+ all_paths_ops = [op for op in additions if isinstance ( op . path_or_fileobj , ( str , Path )) ]
641+
642+ if len ( all_paths_ops ) > 0 :
643+ all_paths = [str ( op . path_or_fileobj ) for op in all_paths_ops ]
644+ upload_files (
645+ all_paths ,
646+ xet_endpoint ,
647+ access_token_info ,
648+ token_refresher ,
649+ progress_callback ,
650+ repo_type ,
651+ )
652+
653+ if len ( all_bytes_ops ) > 0 :
654+ all_bytes = [ op . path_or_fileobj for op in all_bytes_ops ]
655+ upload_bytes (
656+ all_bytes ,
657+ xet_endpoint ,
658+ access_token_info ,
659+ token_refresher ,
660+ progress_callback ,
661+ repo_type ,
662+ )
603663
604664 finally :
605665 if progress is not None :
0 commit comments