23 | 23 | DIRNAME = Path(__file__).absolute().resolve().parent |
24 | 24 |
25 | 25 |
26 | | -def process_results(current_result, sota_result): |
27 | | - # Convert the results to dataframes |
28 | | - current_df = pd.DataFrame(current_result) |
29 | | - sota_df = pd.DataFrame(sota_result) |
30 | | - |
31 | | - # Combine the dataframes on the Metric index |
32 | | - combined_df = pd.concat([current_df, sota_df], axis=1) |
33 | | - combined_df.columns = ["current_df", "sota_df"] |
34 | | - |
35 | | - combined_df["the largest"] = combined_df.apply( |
36 | | - lambda row: "sota_df" |
37 | | - if row["sota_df"] > row["current_df"] |
38 | | - else ("Equal" if row["sota_df"] == row["current_df"] else "current_df"), |
39 | | - axis=1, |
40 | | - ) |
41 | | - |
42 | | - # Add a note about metric direction |
43 | | - combined_df["Note"] = "Direction of improvement (higher/lower is better) should be judged per metric" |
44 | | - |
45 | | - return combined_df |
46 | | - |
47 | | - |
48 | 26 | class KGHypothesisExperiment2Feedback(HypothesisExperiment2Feedback): |
| 27 | + def process_results(self, current_result, sota_result): |
| 28 | + # Convert the results to dataframes |
| 29 | + current_df = pd.DataFrame(current_result) |
| 30 | + sota_df = pd.DataFrame(sota_result) |
| 31 | + |
| 32 | + # Combine the dataframes on the Metric index |
| 33 | + combined_df = pd.concat([current_df, sota_df], axis=1) |
| 34 | + combined_df.columns = ["current_df", "sota_df"] |
| 35 | + |
| 36 | + # combined_df["the largest"] = combined_df.apply( |
| 37 | + # lambda row: "sota_df" |
| 38 | + # if row["sota_df"] > row["current_df"] |
| 39 | + # else ("Equal" if row["sota_df"] == row["current_df"] else "current_df"), |
| 40 | + # axis=1, |
| 41 | + # ) |
| 42 | + |
| 43 | + # Add a note about metric direction |
| 44 | + evaluation_direction = "higher" if self.scen.evaluation_metric_direction else "lower" |
| 45 | + combined_df[ |
| 46 | + "Note" |
| 47 | + ] = f"Direction of improvement (higher/lower is better) should be judged per metric. Here '{evaluation_direction}' is better for the metrics." |
| 48 | + |
| 49 | + return combined_df |
| 50 | + |
49 | 51 | def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback: |
50 | 52 | """ |
51 | 53 | The `ti` should be executed and the results should be included, as well as the comparison between previous results (done by LLM). |
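As a quick sanity check of the `process_results` steps in the hunk above, here is a minimal sketch; pandas is the only dependency, and the metric names and values are invented for illustration:

```python
import pandas as pd

# Stand-in experiment results, indexed by metric name (values invented).
current_result = pd.Series({"rmse": 0.42, "auc": 0.71})
sota_result = pd.Series({"rmse": 0.40, "auc": 0.73})

# Same steps as the method above: one column per run, aligned on the metric index.
combined_df = pd.concat([pd.DataFrame(current_result), pd.DataFrame(sota_result)], axis=1)
combined_df.columns = ["current_df", "sota_df"]
print(combined_df)
#       current_df  sota_df
# rmse        0.42     0.40
# auc         0.71     0.73
```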
@@ -77,10 +79,10 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac |
77 | 79 | if exp.based_experiments: |
78 | 80 | sota_result = exp.based_experiments[-1].result |
79 | 81 | # Process the results to filter important metrics |
80 | | - combined_result = process_results(current_result, sota_result) |
| 82 | + combined_result = self.process_results(current_result, sota_result) |
81 | 83 | else: |
82 | 84 | # If there are no based experiments, we'll only use the current result |
83 | | - combined_result = process_results(current_result, current_result) # Compare with itself |
| 85 | + combined_result = self.process_results(current_result, current_result) # Compare with itself |
84 | 86 | print("Warning: No previous experiments to compare against. Using current result as baseline.") |
85 | 87 |
86 | 88 | available_features = { |
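The else-branch above compares the current result with itself so the downstream prompt template always receives a two-column table, even on the first run. A sketch of that degenerate case, assuming `feedback` is an instance of the class above:

```python
# With no SOTA baseline both columns hold the current run, so any
# "which is larger" comparison would come out "Equal" for every metric.
combined = feedback.process_results(current_result, current_result)
assert (combined["current_df"] == combined["sota_df"]).all()
```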
@@ -113,35 +115,34 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac |
113 | 115 |
114 | 116 | # Prepare render dictionary |
115 | 117 | render_dict = { |
116 | | - "context": self.scen.get_scenario_all_desc(), |
117 | 118 | "last_hypothesis": trace.hist[-1][0] if trace.hist else None, |
118 | 119 | "last_task_and_code": last_task_and_code, |
119 | 120 | "last_result": trace.hist[-1][1].result if trace.hist else None, |
| 121 | + "sota_task_and_code": exp.based_experiments[-1].experiment_workspace.data_description |
| 122 | + if exp.based_experiments |
| 123 | + else None, |
| 124 | + "sota_result": exp.based_experiments[-1].result if exp.based_experiments else None, |
120 | 125 | "hypothesis": hypothesis, |
121 | 126 | "exp": exp, |
122 | | - "model_code": model_code, |
123 | | - "available_features": available_features, |
124 | | - "combined_result": combined_result, |
125 | | - "hypothesis_text": hypothesis_text, |
126 | | - "task_details": tasks_factors, |
| 127 | + "model_code": model_code, # This turn |
| 128 | + "available_features": available_features, # This turn |
| 129 | + "combined_result": combined_result, # This turn and sota |
| 130 | + "hypothesis_text": hypothesis_text, # This turn |
| 131 | + "task_details": tasks_factors, # This turn |
127 | 132 | } |
128 | 133 |
129 | | - # Generate the user prompt |
130 | 134 | usr_prompt = ( |
131 | 135 | Environment(undefined=StrictUndefined).from_string(prompt_dict[prompt_key]["user"]).render(**render_dict) |
132 | 136 | ) |
133 | 137 |
134 | | - # Call the APIBackend to generate the response for hypothesis feedback |
135 | 138 | response = APIBackend().build_messages_and_create_chat_completion( |
136 | 139 | user_prompt=usr_prompt, |
137 | 140 | system_prompt=sys_prompt, |
138 | 141 | json_mode=True, |
139 | 142 | ) |
140 | 143 |
141 | | - # Parse the JSON response to extract the feedback |
142 | 144 | response_json = json.loads(response) |
143 | 145 |
144 | | - # Extract fields from JSON response |
145 | 146 | observations = response_json.get("Observations", "No observations provided") |
146 | 147 | hypothesis_evaluation = response_json.get("Feedback for Hypothesis", "No feedback provided") |
147 | 148 | new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided") |
|
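For reviewers unfamiliar with the unchanged rendering/parsing path at the end of this hunk: `StrictUndefined` makes the template render fail loudly if `render_dict` is missing a referenced key, and the `.get()` defaults tolerate fields the LLM omits. A minimal sketch of the same pattern (the template string and LLM reply below are stand-ins, not the project's real prompts):

```python
import json

from jinja2 import Environment, StrictUndefined

# Rendering: StrictUndefined raises on missing keys instead of emitting "".
template = "Result table:\n{{ combined_result }}"  # stand-in template
usr_prompt = Environment(undefined=StrictUndefined).from_string(template).render(combined_result="<table>")

# Parsing: json_mode=True means the backend should return a JSON object;
# .get() with defaults keeps the feedback loop alive when a field is absent.
reply = '{"Observations": "RMSE regressed slightly."}'  # stand-in LLM reply
response_json = json.loads(reply)
observations = response_json.get("Observations", "No observations provided")
new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided")
```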