@@ -1475,19 +1475,27 @@ def test_summarization(self):
1475
1475
)
1476
1476
1477
1477
expected_summaries = [
1478
+ "<pad> "
1478
1479
'prosecutor: "so far no videos were used in the crash investigation" two magazines claim to have found a'
1479
1480
" cell phone video of the final seconds . \" one can hear cries of 'My God' in several languages,\" one"
1480
- " magazine says ." ,
1481
+ " magazine says ."
1482
+ "</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>" ,
1483
+ "<pad> "
1481
1484
"the formal accession was marked by a ceremony at The Hague, in the Netherlands . the ICC opened a"
1482
1485
" preliminary examination into the situation in the occupied Palestinian territory . as members of the"
1483
- " court, Palestinians may be subject to counter-charges as well ." ,
1486
+ " court, Palestinians may be subject to counter-charges as well ."
1487
+ "</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>" ,
1488
+ "<pad> "
1484
1489
"the u.s. and its negotiating partners reached a very strong framework agreement with Iran . aaron miller:"
1485
1490
" the debate that has already begun since the announcement of the new framework will likely result in more"
1486
1491
" heat than light . the deal would reduce Iran's low-enriched uranium stockpile, cut centrifuges and"
1487
- " implement a rigorous inspection regime ." ,
1492
+ " implement a rigorous inspection regime ."
1493
+ "</s>" ,
1494
+ "<pad> "
1488
1495
"prosecutors say the marriages were part of an immigration scam . if convicted, barrientos faces two"
1489
1496
' criminal counts of "offering a false instrument for filing in the first degree" she has been married 10'
1490
- " times, with nine of her marriages occurring between 1999 and 2002 ." ,
1497
+ " times, with nine of her marriages occurring between 1999 and 2002 ."
1498
+ "</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>" ,
1491
1499
]
1492
1500
1493
1501
use_task_specific_params (model , "summarization" )
@@ -1512,11 +1520,8 @@ def test_summarization(self):
1512
1520
early_stopping = True ,
1513
1521
)
1514
1522
1515
- decoded = tok .batch_decode (hypotheses_batch , skip_special_tokens = True , clean_up_tokenization_spaces = False )
1516
- self .assertListEqual (
1517
- expected_summaries ,
1518
- decoded ,
1519
- )
1523
+ decoded = tok .batch_decode (hypotheses_batch )
1524
+ self .assertListEqual (expected_summaries , decoded )
1520
1525
1521
1526
@slow
1522
1527
def test_translation_en_to_de (self ):
@@ -1526,13 +1531,13 @@ def test_translation_en_to_de(self):
1526
1531
1527
1532
en_text = '"Luigi often said to me that he never wanted the brothers to end up in court", she wrote.'
1528
1533
expected_translation = (
1529
- '"Luigi sagte mir oft, dass er nie wollte, dass die Brüder am Gericht sitzen", schrieb sie.'
1534
+ '<pad> "Luigi sagte mir oft, dass er nie wollte, dass die Brüder am Gericht sitzen", schrieb sie.</s> '
1530
1535
)
1531
1536
1532
1537
input_ids = tok .encode (model .config .prefix + en_text , return_tensors = "pt" )
1533
1538
input_ids = input_ids .to (torch_device )
1534
1539
output = model .generate (input_ids )
1535
- translation = tok .decode (output [0 ], skip_special_tokens = True , clean_up_tokenization_spaces = False )
1540
+ translation = tok .decode (output [0 ])
1536
1541
self .assertEqual (translation , expected_translation )
1537
1542
1538
1543
@slow
@@ -1558,13 +1563,15 @@ def test_translation_en_to_fr(self):
1558
1563
do_sample = False ,
1559
1564
early_stopping = True ,
1560
1565
)
1561
- translation = tok .decode (output [0 ], skip_special_tokens = True , clean_up_tokenization_spaces = False )
1566
+ translation = tok .decode (output [0 ])
1562
1567
new_truncated_translation = (
1568
+ "<pad> "
1563
1569
"Cette section d'images provenant de l'enregistrement infrarouge effectué par le télescope Spitzer montre "
1564
1570
"un "
1565
1571
"« portrait familial » de générations innombrables d’étoiles : les plus anciennes sont observées "
1566
1572
"sous forme "
1567
1573
"de points bleus."
1574
+ "</s>"
1568
1575
)
1569
1576
1570
1577
self .assertEqual (translation , new_truncated_translation )
@@ -1575,11 +1582,13 @@ def test_translation_en_to_ro(self):
1575
1582
tok = self .tokenizer
1576
1583
use_task_specific_params (model , "translation_en_to_ro" )
1577
1584
en_text = "Taco Bell said it plans to add 2,000 locations in the US by 2022."
1578
- expected_translation = "Taco Bell a declarat că intenţionează să adauge 2 000 de locaţii în SUA până în 2022."
1585
+ expected_translation = (
1586
+ "<pad> Taco Bell a declarat că intenţionează să adauge 2 000 de locaţii în SUA până în 2022.</s>"
1587
+ )
1579
1588
1580
1589
inputs = tok (model .config .prefix + en_text , return_tensors = "pt" ).to (torch_device )
1581
1590
output = model .generate (** inputs )
1582
- translation = tok .decode (output [0 ], skip_special_tokens = True , clean_up_tokenization_spaces = False )
1591
+ translation = tok .decode (output [0 ])
1583
1592
self .assertEqual (translation , expected_translation )
1584
1593
1585
1594
@slow
0 commit comments