
Commit cc9a8c1

xisen-w and WinstonLiyt authored
feat: better feedback & evaluation (#346)
* Updated new keys for evaluation
* fix the bug in feedback

---------

Co-authored-by: WinstonLiye <1957922024@qq.com>
1 parent c18cc6a commit cc9a8c1

4 files changed

Lines changed: 125 additions & 102 deletions


rdagent/scenarios/kaggle/developer/feedback.py

Lines changed: 35 additions & 34 deletions
@@ -23,29 +23,31 @@
 DIRNAME = Path(__file__).absolute().resolve().parent
 
 
-def process_results(current_result, sota_result):
-    # Convert the results to dataframes
-    current_df = pd.DataFrame(current_result)
-    sota_df = pd.DataFrame(sota_result)
-
-    # Combine the dataframes on the Metric index
-    combined_df = pd.concat([current_df, sota_df], axis=1)
-    combined_df.columns = ["current_df", "sota_df"]
-
-    combined_df["the largest"] = combined_df.apply(
-        lambda row: "sota_df"
-        if row["sota_df"] > row["current_df"]
-        else ("Equal" if row["sota_df"] == row["current_df"] else "current_df"),
-        axis=1,
-    )
-
-    # Add a note about metric direction
-    combined_df["Note"] = "Direction of improvement (higher/lower is better) should be judged per metric"
-
-    return combined_df
-
-
 class KGHypothesisExperiment2Feedback(HypothesisExperiment2Feedback):
+    def process_results(self, current_result, sota_result):
+        # Convert the results to dataframes
+        current_df = pd.DataFrame(current_result)
+        sota_df = pd.DataFrame(sota_result)
+
+        # Combine the dataframes on the Metric index
+        combined_df = pd.concat([current_df, sota_df], axis=1)
+        combined_df.columns = ["current_df", "sota_df"]
+
+        # combined_df["the largest"] = combined_df.apply(
+        #     lambda row: "sota_df"
+        #     if row["sota_df"] > row["current_df"]
+        #     else ("Equal" if row["sota_df"] == row["current_df"] else "current_df"),
+        #     axis=1,
+        # )
+
+        # Add a note about metric direction
+        evaluation_direction = "higher" if self.scen.evaluation_metric_direction else "lower"
+        combined_df[
+            "Note"
+        ] = f"Direction of improvement (higher/lower is better) should be judged per metric. Here '{evaluation_direction}' is better for the metrics."
+
+        return combined_df
+
     def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
         """
         The `ti` should be executed and the results should be included, as well as the comparison between previous results (done by LLM).
@@ -77,10 +79,10 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
         if exp.based_experiments:
             sota_result = exp.based_experiments[-1].result
             # Process the results to filter important metrics
-            combined_result = process_results(current_result, sota_result)
+            combined_result = self.process_results(current_result, sota_result)
         else:
             # If there are no based experiments, we'll only use the current result
-            combined_result = process_results(current_result, current_result)  # Compare with itself
+            combined_result = self.process_results(current_result, current_result)  # Compare with itself
             print("Warning: No previous experiments to compare against. Using current result as baseline.")
 
         available_features = {
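As an aside (not part of the diff), a minimal sketch of what the refactored process_results now produces, assuming the experiment results behave like pandas Series indexed by metric name and that the scenario reports higher-is-better:

import pandas as pd

# Hypothetical metric results; in RD-Agent they would come from exp.result.
current = pd.Series({"AUC": 0.81}, name="score")
sota = pd.Series({"AUC": 0.79}, name="score")

# Same steps as the new method: align the two results column by column.
combined = pd.concat([pd.DataFrame(current), pd.DataFrame(sota)], axis=1)
combined.columns = ["current_df", "sota_df"]

higher_is_better = True  # stand-in for self.scen.evaluation_metric_direction
direction = "higher" if higher_is_better else "lower"
combined["Note"] = (
    "Direction of improvement (higher/lower is better) should be judged per metric. "
    f"Here '{direction}' is better for the metrics."
)
print(combined)

The hard-coded "the largest" comparison is commented out in the diff, presumably because which side wins depends on the metric's direction, which the scenario now supplies.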
@@ -113,35 +115,34 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
 
         # Prepare render dictionary
         render_dict = {
-            "context": self.scen.get_scenario_all_desc(),
             "last_hypothesis": trace.hist[-1][0] if trace.hist else None,
             "last_task_and_code": last_task_and_code,
             "last_result": trace.hist[-1][1].result if trace.hist else None,
+            "sota_task_and_code": exp.based_experiments[-1].experiment_workspace.data_description
+            if exp.based_experiments
+            else None,
+            "sota_result": exp.based_experiments[-1].result if exp.based_experiments else None,
             "hypothesis": hypothesis,
             "exp": exp,
-            "model_code": model_code,
-            "available_features": available_features,
-            "combined_result": combined_result,
-            "hypothesis_text": hypothesis_text,
-            "task_details": tasks_factors,
+            "model_code": model_code,  # This turn
+            "available_features": available_features,  # This turn
+            "combined_result": combined_result,  # This turn and sota
+            "hypothesis_text": hypothesis_text,  # This turn
+            "task_details": tasks_factors,  # This turn
         }
 
-        # Generate the user prompt
         usr_prompt = (
             Environment(undefined=StrictUndefined).from_string(prompt_dict[prompt_key]["user"]).render(**render_dict)
         )
 
-        # Call the APIBackend to generate the response for hypothesis feedback
         response = APIBackend().build_messages_and_create_chat_completion(
             user_prompt=usr_prompt,
             system_prompt=sys_prompt,
             json_mode=True,
         )
 
-        # Parse the JSON response to extract the feedback
         response_json = json.loads(response)
 
-        # Extract fields from JSON response
         observations = response_json.get("Observations", "No observations provided")
         hypothesis_evaluation = response_json.get("Feedback for Hypothesis", "No feedback provided")
         new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided")
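The prompt-rendering and JSON-parsing pattern in the hunk above can be reproduced stand-alone; the template text and model response below are invented for illustration and are not the repository's prompts:

import json
from jinja2 import Environment, StrictUndefined

# Toy template standing in for prompt_dict[prompt_key]["user"].
template = "Current vs SOTA results:\n{{ combined_result }}\nHypothesis: {{ hypothesis_text }}"
usr_prompt = (
    Environment(undefined=StrictUndefined)
    .from_string(template)
    .render(
        combined_result="AUC: 0.81 (current) vs 0.79 (sota)",
        hypothesis_text="Adding lag features improves AUC.",
    )
)

# The chat completion is requested in JSON mode; .get() defaults guard against
# keys the model may omit. The response string here is fabricated for the demo.
response = '{"Observations": "AUC improved over SOTA.", "Feedback for Hypothesis": "Supported."}'
response_json = json.loads(response)
observations = response_json.get("Observations", "No observations provided")
new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided")
print(usr_prompt)
print(observations, "|", new_hypothesis)

StrictUndefined makes the render fail loudly if the template references a key missing from render_dict, which is why every field the prompt needs is collected there first.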

rdagent/scenarios/kaggle/experiment/prompts.yaml

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@ kg_description_template:
       "Competition Features": "Two-line description of the overall features involved within the competition as background."
       "Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output."
       "Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1."
+      "Evaluation Description": "A brief description for what metrics are used in evaluation. An explanation of whether a higher score is better or lower is better in terms of performance."
+      "Evaluation Boolean": "True" or "False" (True means the higher score the better (like accuracy); False means the lower value the better (like loss).)
     }
     Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.
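For illustration only, a hypothetical analysis result that would satisfy the extended template; the values are invented, and only "Evaluation Description" and "Evaluation Boolean" are the keys added by this commit:

# Hypothetical LLM output after the template change; not taken from any real competition.
competition_analysis = {
    "Submission channel number to each sample": 1,
    "Evaluation Description": "Submissions are scored by ROC AUC on a held-out test set; "
                              "a higher score indicates better performance.",
    "Evaluation Boolean": True,  # True: higher is better (e.g. accuracy); False: lower is better (e.g. loss)
}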
rdagent/scenarios/kaggle/experiment/scenario.py

Lines changed: 18 additions & 2 deletions
@@ -33,6 +33,8 @@ def __init__(self, competition: str) -> None:
         self.competition_features = None
         self.submission_specifications = None
         self.model_output_channel = None
+        self.evaluation_desc = None
+        self.evaluation_metric_direction = None
         self._analysis_competition_description()
         self.if_action_choosing_based_on_UCB = KAGGLE_IMPLEMENT_SETTING.if_action_choosing_based_on_UCB
         self.if_using_feature_selection = KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection
@@ -73,12 +75,23 @@ def _analysis_competition_description(self):
             "Submission Specifications", "No submission requirements provided"
         )
         self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1)
+        self.evaluation_desc = response_json_analysis.get(
+            "Evaluation Description", "No evaluation specification provided."
+        )
+        self.evaluation_metric_direction = response_json_analysis.get(
+            "Evaluation Boolean", "No evaluation specification provided."
+        )
 
     def get_competition_full_desc(self) -> str:
+        evaluation_direction = "higher the better" if self.evaluation_metric_direction else "lower the better"
         return f"""Competition Type: {self.competition_type}
     Competition Description: {self.competition_description}
     Target Description: {self.target_description}
     Competition Features: {self.competition_features}
+    Submission Specifications: {self.submission_specifications}
+    Model Output Channel: {self.model_output_channel}
+    Evaluation Descriptions: {self.evaluation_desc}
+    Is the evaluation metric the higher the better: {evaluation_direction}
     """
 
     @property
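A small sketch (toy response, not repository code) of how the two new keys flow from the analysis JSON into the direction text used by get_competition_full_desc:

# Toy stand-in for the JSON returned by the competition-description analysis.
response_json_analysis = {
    "Evaluation Description": "Lower log loss is better.",
    "Evaluation Boolean": False,
}

evaluation_desc = response_json_analysis.get("Evaluation Description", "No evaluation specification provided.")
evaluation_metric_direction = response_json_analysis.get("Evaluation Boolean", "No evaluation specification provided.")

# Note: the fallback is a non-empty (truthy) string, so a missing "Evaluation Boolean"
# key would read as "higher the better" here.
evaluation_direction = "higher the better" if evaluation_metric_direction else "lower the better"
print(evaluation_desc)
print("Is the evaluation metric the higher the better:", evaluation_direction)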
@@ -99,6 +112,8 @@ def background(self) -> str:
                 target_description=self.target_description,
                 competition_features=self.competition_features,
                 submission_specifications=self.submission_specifications,
+                evaluation_desc=self.evaluation_desc,
+                evaluate_bool=self.evaluation_metric_direction,
             )
         )
         return background_prompt
@@ -171,8 +186,9 @@ def simulator(self) -> str:
 
     @property
     def rich_style_description(self) -> str:
-        return """
-kaggle scen """
+        return f"""
+This is the Kaggle scenario for the competition: {KAGGLE_IMPLEMENT_SETTING.competition}
+"""
 
     def get_scenario_all_desc(self) -> str:
         return f"""Background of the scenario:

0 commit comments