2
2
import json
3
3
import logging
4
4
import os
5
+ from datetime import datetime
5
6
from functools import partial
6
7
from pathlib import Path
7
8
from typing import Any , Callable , Optional , Sequence , Type , Union
8
9
9
10
import numpy as np
10
11
import pandas as pd
11
12
13
+ import alpaca_eval
14
+
12
15
from .. import completion_parsers , constants , processors , types , utils
13
16
from ..decoders import get_fn_completions
14
17
@@ -50,10 +53,12 @@ class BaseAnnotator(abc.ABC):
50
53
Keys used to distinguish the example.
51
54
52
55
other_output_keys_to_keep : sequence of str, optional
53
- Other output columns to store besides the annotations.
56
+ Other output columns to store besides the annotations. You can use `{annotation_key}` to refer to the name
57
+ of the annotation column.
54
58
55
59
other_input_keys_to_keep : sequence of str, optional
56
- Other columns to keep from the input dataframe besides the primary keys.
60
+ Other columns to keep from the input dataframe besides the primary keys. You can use `{annotation_key}` to refer
61
+ to the name of the annotation column.
57
62
58
63
is_store_missing_annotations : bool, optional
59
64
Whether to store missing annotations. If True it avoids trying to reannotate examples that have errors.
@@ -90,16 +95,19 @@ def __init__(
90
95
seed : Optional [int ] = 0 ,
91
96
is_avoid_reannotations : bool = True ,
92
97
other_output_keys_to_keep : Sequence [str ] = (
93
- "price_per_example" ,
94
- "time_per_example" ,
95
- "raw_completion" ,
98
+ "{annotation_key}_price_per_example" ,
99
+ "{annotation_key}_time_per_example" ,
100
+ "{annotation_key}_version" ,
101
+ "{annotation_key}_date" ,
102
+ "{annotation_key}_raw_completion" ,
96
103
),
97
104
other_input_keys_to_keep : Sequence [str ] = (),
98
105
is_store_missing_annotations : bool = True ,
99
106
base_dir : Optional [Union [types .AnyPath , Sequence [types .AnyPath ]]] = None ,
100
107
is_raise_if_missing_primary_keys : bool = True ,
101
108
annotation_type : Optional [Type ] = None ,
102
109
is_reapply_parsing : bool = False ,
110
+ ** single_annotator_kwargs ,
103
111
):
104
112
logging .info (f"Creating the annotator from `{ annotators_config } `." )
105
113
base_dir = base_dir or self .DEFAULT_BASE_DIR
@@ -123,9 +131,11 @@ def __init__(
123
131
if self .annotators_config .exists ():
124
132
break
125
133
126
- self .annotators = self ._initialize_annotators ()
134
+ self .annotators = self ._initialize_annotators (** single_annotator_kwargs )
127
135
self .df_annotations = None
128
136
137
+ other_output_keys_to_keep = [c .format (annotation_key = self .annotation_key ) for c in other_output_keys_to_keep ]
138
+ other_input_keys_to_keep = [c .format (annotation_key = self .annotation_key ) for c in other_input_keys_to_keep ]
129
139
self .other_input_keys_to_keep = self ._get_other_input_keys_to_keep (other_input_keys_to_keep )
130
140
self .other_output_keys_to_keep = self ._get_other_output_keys_to_keep (other_output_keys_to_keep )
131
141
self .other_keys_to_keep = self .other_output_keys_to_keep + self .other_input_keys_to_keep
@@ -148,6 +158,11 @@ def annotation_key(self) -> str:
148
158
"""How to refer to the annotations, this will be the key for annotations in the output."""
149
159
return "annotation"
150
160
161
@property
def completion_key(self) -> str:
    """Name of the output column that stores raw completions, built from `annotation_key`."""
    suffix = "_raw_completion"
    return "{}{}".format(self.annotation_key, suffix)
165
+
151
166
@property
152
167
def random_seed_keys (self ) -> list [str ]:
153
168
"""What key / column to seed on for the random generator."""
@@ -227,7 +242,7 @@ def _initialize_annotators_config(self, annotators_config):
227
242
228
243
return annotators_config
229
244
230
- def _initialize_annotators (self ) -> dict [str , "SingleAnnotator" ]:
245
+ def _initialize_annotators (self , ** kwargs ) -> dict [str , "SingleAnnotator" ]:
231
246
"""Load all the configs and prompts if necessary."""
232
247
annotators_config = utils .load_configs (self .annotators_config )
233
248
try :
@@ -241,7 +256,9 @@ def _initialize_annotators(self) -> dict[str, "SingleAnnotator"]:
241
256
seed = self .seed ,
242
257
base_dir = base_dir ,
243
258
annotation_column = self .annotation_key ,
259
+ completion_column = self .completion_key ,
244
260
** annotator_config ,
261
+ ** kwargs ,
245
262
)
246
263
for name , annotator_config in annotators_config .items ()
247
264
}
@@ -311,8 +328,8 @@ def _annotate(self, df_to_annotate: pd.DataFrame, **decoding_kwargs) -> pd.DataF
311
328
]
312
329
# if df_to_annotate "raw_completion" is a dict, put it back to a json string so that you can reparse it
313
330
# TODO: this is for backward compatibility, remove in the future
314
- if "raw_completion" in df_to_annotate .columns :
315
- df_to_annotate ["raw_completion" ] = df_to_annotate ["raw_completion" ].apply (
331
+ if self . completion_key in df_to_annotate .columns :
332
+ df_to_annotate [self . completion_key ] = df_to_annotate [self . completion_key ].apply (
316
333
lambda x : json .dumps (x ) if isinstance (x , dict ) else x
317
334
)
318
335
@@ -583,11 +600,11 @@ class SingleAnnotator:
583
600
annotation_column : str, optional
584
601
Name of the annotation column in the output dataframe.
585
602
586
- is_store_raw_completions : bool , optional
587
- Whether to store raw completions at `"raw_completion"` column in the output dataframe. Note that raw_completion
588
- will not be modified by the postprocessors. E.g. if we switch the columns output_1 and output_2 in the prompt
589
- then the raw completion will show the switched order, which makes interpretation harder. This should
590
- nevertheless not be an issue when using reapply_parsing because of seeding.
603
+ completion_column : str , optional
604
+ Name of the raw completion column in the output dataframe. If None will not store the raw completions. Note that
605
+ raw_completion will not be modified by the postprocessors. E.g. if we switch the columns output_1 and output_2
606
+ in the prompt then the raw completion will show the switched order, which makes interpretation harder. This
607
+ should nevertheless not be an issue when using reapply_parsing because of seeding.
591
608
592
609
processors_to_kwargs : Sequence[dict(str, dict)], optional
593
610
A dictionary of BaseProcessor objects to apply for preprocessing the dataframe before making the prompts and
@@ -599,6 +616,9 @@ class SingleAnnotator:
599
616
600
617
completion_key : str, optional
601
618
Key of the output of `fn_completions` to use for parsing the completions into annotations.
619
+
620
+ packages_for_which_to_show_version : Sequence[str], optional
621
+ List of packages for which to show the version in the metadata of the completions.
602
622
"""
603
623
604
624
def __init__ (
@@ -613,10 +633,12 @@ def __init__(
613
633
batch_size : int = 1 ,
614
634
base_dir : types .AnyPath = constants .EVALUATORS_CONFIG_DIR ,
615
635
annotation_column : str = "annotation" ,
616
- is_store_raw_completions : bool = True ,
636
+ completion_column : Optional [ str ] = "raw_completion" ,
617
637
processors_to_kwargs : Optional [dict [str , dict ]] = None ,
618
638
is_add_default_processors : bool = True ,
619
639
completion_key : str = "completions" ,
640
+ packages_for_which_to_show_version : Optional [Sequence [str ]] = ("alpaca_eval" ,),
641
+ prfx_to_completion_cols : Optional [str ] = "{annotation_column}_" ,
620
642
# The following two keys are only for the documentation
621
643
pretty_name : Optional [str ] = None ,
622
644
link : Optional [str ] = None ,
@@ -637,7 +659,11 @@ def __init__(
637
659
self .is_shuffle = is_shuffle
638
660
self .batch_size = batch_size
639
661
self .annotation_column = annotation_column
640
- self .completion_column = "raw_completion" if is_store_raw_completions else None
662
+ self .completion_column = completion_column
663
+ self .packages_for_which_to_show_version = packages_for_which_to_show_version
664
+ if prfx_to_completion_cols is None :
665
+ prfx_to_completion_cols = ""
666
+ self .prfx_to_completion_cols = prfx_to_completion_cols .format (annotation_column = annotation_column )
641
667
642
668
self .is_add_default_processors = is_add_default_processors
643
669
self .processors = []
@@ -690,9 +716,14 @@ def __call__(self, df_to_annotate: pd.DataFrame, **decoding_kwargs) -> pd.DataFr
690
716
# prompts and completions here will not be the same length as the dataframe due to batching
691
717
prompts , df_to_annotate = self ._make_prompts (df_to_annotate )
692
718
completions = self .fn_completions (prompts = prompts , ** self .completions_kwargs , ** decoding_kwargs )
719
+ self ._add_metadata_to_completions_ (completions )
720
+ completions = {
721
+ f"{ self .prfx_to_completion_cols } { k } " if k != self .completion_key else k : v
722
+ for k , v in completions .items ()
723
+ }
693
724
694
725
for k , v in completions .items ():
695
- if k != "completions" :
726
+ if k != self . completion_key :
696
727
if self .batch_size != 1 and (len (df_to_annotate ) == len (v ) * self .batch_size ):
697
728
v = [el for el in v for _ in range (self .batch_size )]
698
729
df_to_annotate [k ] = v
@@ -735,7 +766,7 @@ def _search_processor(self, name: Union[str, Type["processors.BaseProcessor"]])
735
766
return name
736
767
737
768
def _get_prompt_template(self, prompt_template: types.AnyPath):
    """Load the prompt template via `utils.read_or_return`, passing `self.base_dir` as `relative_to`."""
    base_dir = self.base_dir
    return utils.read_or_return(prompt_template, relative_to=base_dir)
739
770
740
771
def _make_prompts (
741
772
self , df_to_annotate : pd .DataFrame , prompt_template : Optional [str ] = None
@@ -762,6 +793,12 @@ def _make_prompts(
762
793
prompt_template = self .prompt_template
763
794
return utils .make_prompts (df = df_to_annotate , template = prompt_template , batch_size = self .batch_size )
764
795
796
def _add_metadata_to_completions_(self, completions: dict[str, Any]):
    """Annotate `completions` in place with a timestamp and, optionally, package versions.

    The `"date"` entry is always set to the current time in ISO format. The `"version"` entry is only set
    when `self.packages_for_which_to_show_version` is not None.
    """
    completions["date"] = datetime.now().isoformat()
    packages = self.packages_for_which_to_show_version
    if packages is None:
        # version reporting is disabled, only the date is recorded
        return
    completions["version"] = utils.get_multi_package_version(packages)
801
+
765
802
def _preprocess (self , df_to_annotate : pd .DataFrame ) -> pd .DataFrame :
766
803
"""Preprocess the examples before annotating. In particular, takes care of all the randomization."""
767
804
0 commit comments