@@ -404,25 +404,6 @@ def test_columns_dtypes(self, engine):
404404 df .columns = ["foo" , "bar" ]
405405 check_round_trip (df , engine )
406406
407- def test_columns_dtypes_invalid (self , engine ):
408- df = pd .DataFrame ({"string" : list ("abc" ), "int" : list (range (1 , 4 ))})
409-
410- msg = "parquet must have string column names"
411- # numeric
412- df .columns = [0 , 1 ]
413- self .check_error_on_write (df , engine , ValueError , msg )
414-
415- # bytes
416- df .columns = [b"foo" , b"bar" ]
417- self .check_error_on_write (df , engine , ValueError , msg )
418-
419- # python object
420- df .columns = [
421- datetime .datetime (2011 , 1 , 1 , 0 , 0 ),
422- datetime .datetime (2011 , 1 , 1 , 1 , 1 ),
423- ]
424- self .check_error_on_write (df , engine , ValueError , msg )
425-
426407 @pytest .mark .parametrize ("compression" , [None , "gzip" , "snappy" , "brotli" ])
427408 def test_compression (self , engine , compression ):
428409 if compression == "snappy" :
@@ -528,16 +509,16 @@ def test_write_column_multiindex(self, engine):
528509 # Not able to write column multi-indexes with non-string column names.
529510 mi_columns = pd .MultiIndex .from_tuples ([("a" , 1 ), ("a" , 2 ), ("b" , 1 )])
530511 df = pd .DataFrame (np .random .randn (4 , 3 ), columns = mi_columns )
531- msg = (
532- r"\s*parquet must have string column names for all values in\s*"
533- "each level of the MultiIndex"
534- )
535- self .check_error_on_write (df , engine , ValueError , msg )
536512
537- def test_write_column_multiindex_nonstring (self , pa ):
513+ if engine == "fastparquet" :
514+ self .check_error_on_write (
515+ df , engine , TypeError , "Column name must be a string"
516+ )
517+ elif engine == "pyarrow" :
518+ check_round_trip (df , engine )
519+
520+ def test_write_column_multiindex_nonstring (self , engine ):
538521 # GH #34777
539- # Not supported in fastparquet as of 0.1.3
540- engine = pa
541522
542523 # Not able to write column multi-indexes with non-string column names
543524 arrays = [
@@ -546,11 +527,14 @@ def test_write_column_multiindex_nonstring(self, pa):
546527 ]
547528 df = pd .DataFrame (np .random .randn (8 , 8 ), columns = arrays )
548529 df .columns .names = ["Level1" , "Level2" ]
549- msg = (
550- r"\s*parquet must have string column names for all values in\s*"
551- "each level of the MultiIndex"
552- )
553- self .check_error_on_write (df , engine , ValueError , msg )
530+ if engine == "fastparquet" :
531+ if Version (fastparquet .__version__ ) < Version ("0.7.0" ):
532+ err = TypeError
533+ else :
534+ err = ValueError
535+ self .check_error_on_write (df , engine , err , "Column name" )
536+ elif engine == "pyarrow" :
537+ check_round_trip (df , engine )
554538
555539 def test_write_column_multiindex_string (self , pa ):
556540 # GH #34777
@@ -579,17 +563,19 @@ def test_write_column_index_string(self, pa):
579563
580564 check_round_trip (df , engine )
581565
def test_write_column_index_nonstring(self, engine):
    """Write a frame whose named column index holds integer labels.

    With the ``"fastparquet"`` engine the write is expected to raise
    ``TypeError``; with any other engine the frame is round-tripped.
    """
    # GH #34777

    # Write column indexes with non-string (integer) column names
    int_labels = [1, 2, 3, 4]
    df = pd.DataFrame(np.random.randn(8, 4), columns=int_labels)
    df.columns.name = "NonStringCol"

    if engine != "fastparquet":
        check_round_trip(df, engine)
        return

    self.check_error_on_write(
        df, engine, TypeError, "Column name must be a string"
    )
593579
594580 @pytest .mark .skipif (pa_version_under7p0 , reason = "minimum pyarrow not installed" )
595581 def test_dtype_backend (self , engine , request ):
@@ -1041,6 +1027,31 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa):
10411027 expected = expected ,
10421028 )
10431029
def test_columns_dtypes_not_invalid(self, pa):
    """Exercise non-string column labels with the pyarrow engine.

    Integer and ``datetime`` labels are round-tripped; ``bytes`` labels
    are expected to raise ``NotImplementedError`` during the round trip.
    """
    df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

    # numeric
    df.columns = [0, 1]
    check_round_trip(df, pa)

    # bytes
    df.columns = [b"foo", b"bar"]
    # ``match`` is a regular expression applied with ``re.search``. An
    # unescaped leading "|" is an alternation with the empty pattern and
    # therefore matches *any* message, making the check vacuous. Escape
    # the pipe so the literal text "|S3" is required.
    # NOTE(review): "|S3" is presumably the NumPy dtype string of the
    # 3-byte labels echoed in the error message - confirm against pyarrow.
    with pytest.raises(NotImplementedError, match=r"\|S3"):
        # Bytes fails on read_parquet
        check_round_trip(df, pa)

    # python object
    df.columns = [
        datetime.datetime(2011, 1, 1, 0, 0),
        datetime.datetime(2011, 1, 1, 1, 1),
    ]
    check_round_trip(df, pa)
1049+
def test_empty_columns(self, pa):
    """Round-trip a frame that has a named index but no columns (pyarrow)."""
    # GH 52034
    named_index = pd.Index(["a", "b", "c"], name="custom name")
    frame = pd.DataFrame(index=named_index)
    check_round_trip(frame, pa)
1054+
10441055
10451056class TestParquetFastParquet (Base ):
10461057 def test_basic (self , fp , df_full ):
@@ -1052,6 +1063,27 @@ def test_basic(self, fp, df_full):
10521063 df ["timedelta" ] = pd .timedelta_range ("1 day" , periods = 3 )
10531064 check_round_trip (df , fp )
10541065
def test_columns_dtypes_invalid(self, fp):
    """Check that fastparquet rejects non-string column labels on write.

    The same frame is relabelled with integer, ``bytes`` and ``datetime``
    column labels in turn; each write is expected to raise ``TypeError``
    with the message "Column name must be a string".
    """
    df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})

    expected_exc = TypeError
    expected_msg = "Column name must be a string"

    label_sets = (
        # numeric
        [0, 1],
        # bytes
        [b"foo", b"bar"],
        # python object
        [
            datetime.datetime(2011, 1, 1, 0, 0),
            datetime.datetime(2011, 1, 1, 1, 1),
        ],
    )
    for labels in label_sets:
        df.columns = labels
        self.check_error_on_write(df, fp, expected_exc, expected_msg)
1086+
10551087 def test_duplicate_columns (self , fp ):
10561088 # not currently able to handle duplicate columns
10571089 df = pd .DataFrame (np .arange (12 ).reshape (4 , 3 ), columns = list ("aaa" )).copy ()
@@ -1281,3 +1313,12 @@ def test_invalid_dtype_backend(self, engine):
12811313 df .to_parquet (path )
12821314 with pytest .raises (ValueError , match = msg ):
12831315 read_parquet (path , dtype_backend = "numpy" )
1316+
def test_empty_columns(self, fp):
    """Round-trip a frame with a named index and no columns (fastparquet).

    The result is compared against an explicitly built frame whose column
    index is an empty ``object``-dtype ``Index``.
    """
    # GH 52034
    row_labels = ["a", "b", "c"]
    df = pd.DataFrame(index=pd.Index(row_labels, name="custom name"))

    empty_object_columns = pd.Index([], dtype=object)
    expected = pd.DataFrame(
        columns=empty_object_columns,
        index=pd.Index(row_labels, name="custom name"),
    )
    check_round_trip(df, fp, expected=expected)
0 commit comments