Uses AI vision to find and double-click a UI element based on its description. This endpoint utilizes advanced AI models to analyze a screenshot, locate a UI element matching the provided description, and perform a double-click action at the identified coordinates.
This action is particularly useful for:
The AI vision system understands natural language descriptions of UI elements and can identify buttons, icons, text fields, links, and other interactive elements without requiring precise coordinates or technical identifiers.
Note: For the best results, the element description should be clear and specific to help the AI accurately identify the target element on the screen.
Token Usage: Each invocation of this endpoint consumes 50-100 Smooth Operator API tokens. The token consumption varies based on the complexity of identifying the described element. More specific descriptions generally require fewer tokens to process.
The request should include a natural language description of the UI element to double-click. Optionally, you can provide a base64-encoded screenshot image; if not provided, the system will automatically capture a screenshot.
{
  "taskDescription": "string", // Description of the UI element to find and double-click
  "imageBase64": "string",     // Optional base64-encoded screenshot image
  "mechanism": "string"        // Optional mechanism to use for vision detection
}
Parameter | Type | Required | Description |
---|---|---|---|
taskDescription | string | Yes | A descriptive text that identifies the UI element to double-click, such as "the trash can icon" or "the Excel file on the desktop" |
imageBase64 | string | No | Optional base64-encoded screenshot image. If not provided, the system will automatically take a screenshot. |
mechanism | string | No | Optional mechanism to use for vision detection. Available options: "screengrasp2" (default), "screengrasp2-low", "screengrasp2-medium", "screengrasp2-high", "llabs", "anthropic-computer-use", "openai-computer-use", "qwen25-vl-72b". See Screengrasp documentation for more details. |
{
"taskDescription": "the folder icon",
"mechanism": "screengrasp2"
}
The response indicates whether the double-click operation was successful, along with additional information about the action performed.
{
  "Success": boolean,    // Whether the operation was successful
  "Message": "string",   // Result message or error description
  "Timestamp": "string"  // ISO timestamp of the operation
}
Field | Type | Description |
---|---|---|
Success | boolean | Indicates whether the operation was successful |
Message | string | A message describing the result of the operation, including the coordinates where the double-click was performed |
Timestamp | string | The timestamp when the action was executed (ISO format) |
{
"Success": true,
"Message": "Double-clicked at coordinates (584, 322)",
"Timestamp": "2023-11-15T14:28:35.123Z"
}
{
"Success": false,
"Message": "Failed to find element: Could not identify any UI element matching the description 'the trash can icon'",
"Timestamp": "2023-11-15T14:28:35.123Z"
}
import requests
import json
import base64
from PIL import ImageGrab
import io


def double_click_element_by_description(api_key, element_description, image_base64=None,
                                        mechanism="screengrasp2", timeout=None):
    """Ask the Tools Server to find a UI element by description and double-click it.

    Args:
        api_key: API key sent as a Bearer token in the Authorization header.
        element_description: Natural-language description of the element,
            sent as ``taskDescription``.
        image_base64: Optional base64-encoded screenshot. When omitted (or
            empty) the field is left out of the request and the server
            captures its own screenshot.
        mechanism: Vision mechanism to use for detection.
        timeout: Optional timeout in seconds passed to ``requests.post``.
            ``None`` (the default) waits indefinitely.

    Returns:
        The decoded JSON response (``Success``, ``Message``, ``Timestamp``)
        when the server answers with HTTP 200; ``None`` on any other status
        code or when the request itself fails (connection refused, timeout).
    """
    url = "http://localhost:54321/tools-api/mouse/doubleclick-by-description"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Prepare the request payload
    payload = {
        "taskDescription": element_description,
        "mechanism": mechanism
    }

    # Include screenshot if provided
    if image_base64:
        payload["imageBase64"] = image_base64

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Transport-level failures are reported the same way as HTTP errors
        # so callers only have to handle a single failure value (None).
        print(f"Error: request failed: {exc}")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}")
    print(response.text)
    return None


# Example usage with automatic screenshot
api_key = "your_api_key_here"
result = double_click_element_by_description(api_key, "the trash can icon")
if result and result.get("Success"):
    print(f"Success: {result.get('Message')}")
else:
    print(f"Failed: {result.get('Message') if result else 'Unknown error'}")


# Example with manual screenshot
def capture_and_encode_screenshot():
    """Grab the screen with PIL and return it as a PNG data URI (base64)."""
    screenshot = ImageGrab.grab()
    buffer = io.BytesIO()
    screenshot.save(buffer, format="PNG")
    return f"data:image/png;base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"


manual_screenshot = capture_and_encode_screenshot()
result_with_screenshot = double_click_element_by_description(
    api_key, "the Excel icon on desktop", manual_screenshot
)
/** Result returned by the Tools Server for a mouse action. */
interface ActionResponse {
  /** Whether the operation was successful. */
  Success: boolean;
  /** Result message or error description. */
  Message: string;
  /** ISO timestamp of the operation. */
  Timestamp: string;
}

/** JSON body sent to the doubleclick-by-description endpoint. */
interface DoubleClickRequest {
  taskDescription: string;
  mechanism: string;
  imageBase64?: string;
}

/**
 * Asks the Tools Server to locate a UI element by its natural-language
 * description and double-click it.
 *
 * @param apiKey - API key sent as a Bearer token.
 * @param elementDescription - Description of the element, sent as `taskDescription`.
 * @param imageBase64 - Optional base64-encoded screenshot; when omitted or empty
 *   the field is left out of the request body.
 * @param mechanism - Vision mechanism to use for detection.
 * @returns The parsed response on an OK HTTP status, or `null` when the server
 *   answers with a non-OK status or the request throws.
 */
async function doubleClickElementByDescription(
  apiKey: string,
  elementDescription: string,
  imageBase64?: string,
  mechanism: string = "screengrasp2"
): Promise<ActionResponse | null> {
  const url = "http://localhost:54321/tools-api/mouse/doubleclick-by-description";

  try {
    // Prepare the request payload
    const payload: DoubleClickRequest = {
      taskDescription: elementDescription,
      mechanism,
    };

    // Include screenshot if provided
    if (imageBase64) {
      payload.imageBase64 = imageBase64;
    }

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Authorization": `Bearer ${apiKey}`,
      },
      body: JSON.stringify(payload),
    });

    if (!response.ok) {
      console.error(`Error: ${response.status}`);
      console.error(await response.text());
      return null;
    }

    // The response body is trusted to match the documented shape.
    return (await response.json()) as ActionResponse;
  } catch (error: unknown) {
    console.error("Failed to double-click element:", error);
    return null;
  }
}

// Example usage
async function example() {
  const apiKey = "your_api_key_here";

  // Basic usage without providing a screenshot
  const result = await doubleClickElementByDescription(apiKey, "the trash can icon");

  if (result?.Success) {
    console.log(`Success: ${result.Message}`);
  } else {
    // `||` is deliberate: an empty message also falls back to the generic text.
    console.log(`Failed: ${result?.Message || 'Unknown error'}`);
  }

  // Example with a custom screenshot (if you have one)
  // const customScreenshot = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...";
  // const resultWithScreenshot = await doubleClickElementByDescription(
  //   apiKey,
  //   "the Excel icon on desktop",
  //   customScreenshot
  // );
}
using System;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

/// <summary>
/// Minimal client for the Tools Server doubleclick-by-description endpoint.
/// </summary>
public class ToolsServerClient
{
    // Serializer options are cached so they are not rebuilt on every call.
    // Requests use camelCase so the body matches the documented schema
    // ("taskDescription", "imageBase64", "mechanism").
    private static readonly JsonSerializerOptions RequestJsonOptions = new JsonSerializerOptions
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    private static readonly JsonSerializerOptions ResponseJsonOptions = new JsonSerializerOptions
    {
        PropertyNameCaseInsensitive = true
    };

    private readonly HttpClient _httpClient;
    private readonly string _apiKey;

    /// <summary>
    /// Creates a client bound to http://localhost:54321 that sends
    /// <paramref name="apiKey"/> as a Bearer token on every request.
    /// </summary>
    public ToolsServerClient(string apiKey)
    {
        _httpClient = new HttpClient
        {
            BaseAddress = new Uri("http://localhost:54321")
        };
        _apiKey = apiKey;
        _httpClient.DefaultRequestHeaders.Add("Authorization", $"Bearer {apiKey}");
    }

    /// <summary>
    /// Asks the server to find the described UI element and double-click it.
    /// </summary>
    /// <param name="elementDescription">Natural-language description of the element.</param>
    /// <param name="imageBase64">Optional base64-encoded screenshot; null leaves it unset.</param>
    /// <param name="mechanism">Vision mechanism to use for detection.</param>
    /// <returns>The deserialized <see cref="ActionResponse"/>.</returns>
    /// <exception cref="HttpRequestException">
    /// Thrown by <c>EnsureSuccessStatusCode</c> when the server returns a non-success status.
    /// </exception>
    public async Task<ActionResponse> DoubleClickElementByDescriptionAsync(
        string elementDescription,
        string imageBase64 = null,
        string mechanism = "screengrasp2")
    {
        var request = new ScreenGrasp2Request
        {
            TaskDescription = elementDescription,
            Mechanism = mechanism
        };

        if (imageBase64 != null)
        {
            request.ImageBase64 = imageBase64;
        }

        var json = JsonSerializer.Serialize(request, RequestJsonOptions);
        var content = new StringContent(json, Encoding.UTF8, "application/json");

        var response = await _httpClient.PostAsync("/tools-api/mouse/doubleclick-by-description", content);
        response.EnsureSuccessStatusCode();

        var jsonResponse = await response.Content.ReadAsStringAsync();
        return JsonSerializer.Deserialize<ActionResponse>(jsonResponse, ResponseJsonOptions);
    }
}

// Request and response models

/// <summary>Request body for the doubleclick-by-description endpoint.</summary>
public class ScreenGrasp2Request
{
    public string TaskDescription { get; set; }
    public string ImageBase64 { get; set; }
    public string Mechanism { get; set; }
}

/// <summary>Response returned by the endpoint.</summary>
public class ActionResponse
{
    public bool Success { get; set; }
    public string Message { get; set; }
    public DateTime Timestamp { get; set; }
}

// Example usage
public class Example
{
    public static async Task Main()
    {
        var client = new ToolsServerClient("your_api_key_here");

        // Basic usage without providing a screenshot
        var result = await client.DoubleClickElementByDescriptionAsync("the trash can icon");

        if (result.Success)
        {
            Console.WriteLine($"Success: {result.Message}");
        }
        else
        {
            Console.WriteLine($"Failed: {result.Message}");
        }

        // Example with a custom screenshot (if you have one)
        // string customScreenshot = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...";
        // var resultWithScreenshot = await client.DoubleClickElementByDescriptionAsync(
        //     "the Excel icon on desktop",
        //     customScreenshot
        // );
    }
}