diff --git a/operate/config.py b/operate/config.py index b97b20ac..8337f696 100644 --- a/operate/config.py +++ b/operate/config.py @@ -78,7 +78,7 @@ def initialize_google(self): ) api_key = os.getenv("GOOGLE_API_KEY") genai.configure(api_key=api_key, transport="rest") - model = genai.GenerativeModel("gemini-pro-vision") + model = genai.GenerativeModel("gemini-exp-1206") return model @@ -116,7 +116,7 @@ def validation(self, model, voice_mode): or model == "o1-with-ocr", ) self.require_api_key( - "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision" + "GOOGLE_API_KEY", "Google API key", model == "gemini-exp-1206" ) self.require_api_key( "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" diff --git a/operate/models/apis.py b/operate/models/apis.py index d0ccb0c4..51bce99f 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -162,17 +162,29 @@ def call_gemini_pro_vision(messages, objective): if config.verbose: print("[call_gemini_pro_vision] model", model) - response = model.generate_content([prompt, Image.open(screenshot_filename)]) + response = model.generate_content( + [prompt, Image.open(screenshot_filename)]) content = response.text[1:] if config.verbose: print("[call_gemini_pro_vision] response", response) - print("[call_gemini_pro_vision] content", content) + print("[call_gemini_pro_vision] raw content before parsing:", content) + + content = content.strip("`").strip() + if not content: + raise ValueError( + "[call_gemini_pro_vision] Empty content received.") + + try: + content = json.loads(content) + except json.JSONDecodeError as e: + print(f"[ERROR] JSONDecodeError: {e}") + print(f"[DEBUG] Problematic content: {content}") + raise - content = json.loads(content) if config.verbose: print( - "[get_next_action][call_gemini_pro_vision] content", + "[get_next_action][call_gemini_pro_vision] parsed content", content, ) @@ -180,7 +192,8 @@ def call_gemini_pro_vision(messages, objective): except Exception as e: print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying another method {ANSI_RESET}" + f"{ANSI_GREEN}[Self-Operating Computer]{ + ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying another method {ANSI_RESET}" ) if config.verbose: print("[Self-Operating Computer][Operate] error", e) @@ -188,6 +201,7 @@ def call_gemini_pro_vision(messages, objective): return call_gpt_4o(messages) + async def call_gpt_4o_with_ocr(messages, objective, model): if config.verbose: print("[call_gpt_4o_with_ocr]")