This endpoint uses AI vision to find and click a UI element based on its textual description. The system takes a screenshot of the current screen, analyzes it with vision AI to locate the described element, and performs a mouse click at the coordinates of the identified element.
This is especially useful for agent systems that need to interact with dynamic user interfaces where precise coordinates may change, but visual elements remain consistently identifiable through their appearance.
Token Usage: Each invocation of this endpoint consumes 50-100 Smooth Operator API tokens, depending on the complexity of identifying the described element. More specific and clear descriptions typically result in lower token usage.
The request should be a JSON object with the following properties:
{
  "taskDescription": "string", // Description of the UI element to find and click
  "imageBase64": "string",     // Optional base64-encoded screenshot image
  "mechanism": "string"        // Optional mechanism to use for vision detection
}
Parameter | Type | Required | Description |
---|---|---|---|
taskDescription | string | Yes | A descriptive text of the UI element to find and click (e.g., "the File menu", "the trash can icon", "the Send button"). |
imageBase64 | string | No | Optional base64-encoded screenshot image. If not provided, the system will automatically take a screenshot. |
mechanism | string | No | Optional mechanism to use for vision detection. Available options: "screengrasp2" (default), "screengrasp2-low", "screengrasp2-medium", "screengrasp2-high", "llabs", "anthropic-computer-use", "openai-computer-use", "qwen25-vl-72b". See Screengrasp documentation for more details. |
{
"taskDescription": "the File menu",
"mechanism": "screengrasp2"
}
{
  "success": boolean,     // Whether the operation was successful
  "message": "string",    // Result message or error description
  "timestamp": "string"   // ISO timestamp of the operation
}
Field | Type | Description |
---|---|---|
success | boolean | Indicates whether the operation was successful. |
message | string | A message describing the result of the operation. Contains an error message if the operation failed or details about the click location if successful. |
timestamp | string | ISO timestamp indicating when the operation was performed. |
{
"success": true,
"message": "click executed at (45, 23)",
"timestamp": "2025-03-12T09:20:45.123Z"
}
{
"success": false,
"message": "Failed to find element: ",
"timestamp": "2025-03-12T09:20:45.123Z"
}
import requests
import json


def click_element_by_description(api_url, api_key, element_description,
                                 image_base64=None, mechanism=None, timeout=60):
    """
    Use AI vision to find and click a UI element based on its description

    Args:
        api_url (str): The base URL of the Tools Server API
        api_key (str): The API key for authentication
        element_description (str): Description of the UI element to click
        image_base64 (str, optional): Base64-encoded screenshot to analyze.
            When omitted the field is not sent and the server takes its own
            screenshot.
        mechanism (str, optional): Vision detection mechanism, e.g.
            "screengrasp2". When omitted the field is not sent and the
            server uses its default.
        timeout (float, optional): Seconds to wait for the server before
            requests raises a timeout error. Pass None to wait indefinitely.

    Returns:
        dict: Parsed JSON response with "success", "message" and "timestamp"
    """
    # Tolerate a base URL given with a trailing slash so the path never
    # contains a double slash.
    endpoint = f"{api_url.rstrip('/')}/tools-api/mouse/click-by-description"

    # Prepare the request payload; optional fields are only included when
    # the caller supplied them.
    payload = {"taskDescription": element_description}
    if image_base64 is not None:
        payload["imageBase64"] = image_base64
    if mechanism is not None:
        payload["mechanism"] = mechanism

    # Set up the headers with authentication
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Send the request. Without an explicit timeout requests would block
    # forever if the server stopped responding.
    response = requests.post(endpoint, headers=headers, json=payload, timeout=timeout)

    # Parse and return the response
    return response.json()


# Example usage
if __name__ == "__main__":
    API_URL = "http://localhost:8080"
    API_KEY = "your_api_key_here"

    # Example: Find and click the File menu
    result = click_element_by_description(API_URL, API_KEY, "the File menu")
    print(f"Success: {result['success']}")
    print(f"Message: {result['message']}")
    print(f"Timestamp: {result['timestamp']}")
/** JSON body accepted by the click-by-description endpoint. */
interface ClickByDescriptionRequest {
  taskDescription: string;
  imageBase64?: string;
  mechanism?: string;
}

/** JSON body returned by the click-by-description endpoint. */
interface ClickByDescriptionResponse {
  success: boolean;
  message: string;
  timestamp: string;
}

/**
 * Use AI vision to find and click a UI element based on its description
 *
 * @param apiUrl - Base URL of the Tools Server API
 * @param apiKey - API key for authentication
 * @param elementDescription - Description of the UI element to click
 * @param options - Optional request fields (`imageBase64`, `mechanism`);
 *   fields left undefined are not sent
 * @returns Promise with the response containing success status and coordinates
 */
async function clickElementByDescription(
  apiUrl: string,
  apiKey: string,
  elementDescription: string,
  options: Omit<ClickByDescriptionRequest, 'taskDescription'> = {}
): Promise<ClickByDescriptionResponse> {
  // Strip trailing slashes from the base URL so the path never contains "//".
  const endpoint = `${apiUrl.replace(/\/+$/, '')}/tools-api/mouse/click-by-description`;

  // Prepare the request payload. taskDescription is written last so it can
  // never be overridden by a stray key in options.
  const payload: ClickByDescriptionRequest = {
    ...options,
    taskDescription: elementDescription
  };

  // Make the API request
  const response = await fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`
    },
    body: JSON.stringify(payload)
  });

  // Parse and return the response
  return (await response.json()) as ClickByDescriptionResponse;
}

// Example usage
async function example() {
  const API_URL = 'http://localhost:8080';
  const API_KEY = 'your_api_key_here';

  try {
    // Example: Find and click the File menu
    const result = await clickElementByDescription(API_URL, API_KEY, 'the File menu');
    console.log(`Success: ${result.success}`);
    console.log(`Message: ${result.message}`);
    console.log(`Timestamp: ${result.timestamp}`);
  } catch (error) {
    console.error('API request failed:', error);
  }
}
using System;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Text;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Threading.Tasks;

namespace SmoothOperatorClient
{
    /// <summary>
    /// Minimal client for the Tools Server click-by-description endpoint.
    /// </summary>
    public class ToolsServerClient
    {
        // The API speaks camelCase JSON ("taskDescription", "success", ...).
        // System.Text.Json writes property names verbatim and matches them
        // case-sensitively by default, so without these options the response
        // would deserialize to an object with every property left at its default.
        // Null optional fields are omitted so the server applies its own defaults.
        private static readonly JsonSerializerOptions s_jsonOptions = new JsonSerializerOptions
        {
            PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
            PropertyNameCaseInsensitive = true,
            DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
        };

        private readonly HttpClient _httpClient;
        private readonly string _apiKey;

        /// <summary>
        /// Creates a client that sends requests to <paramref name="apiUrl"/>
        /// authenticated with a Bearer token.
        /// </summary>
        /// <param name="apiUrl">Base URL of the Tools Server API</param>
        /// <param name="apiKey">API key sent in the Authorization header</param>
        public ToolsServerClient(string apiUrl, string apiKey)
        {
            _httpClient = new HttpClient
            {
                BaseAddress = new Uri(apiUrl)
            };
            _apiKey = apiKey;
            _httpClient.DefaultRequestHeaders.Authorization =
                new AuthenticationHeaderValue("Bearer", _apiKey);
        }

        /// <summary>
        /// Use AI vision to find and click a UI element based on its description
        /// </summary>
        /// <param name="elementDescription">Description of the UI element to click</param>
        /// <param name="imageBase64">Optional base64-encoded screenshot; omitted from the request when null</param>
        /// <param name="mechanism">Optional vision detection mechanism; omitted from the request when null</param>
        /// <returns>Response containing success status and click coordinates</returns>
        /// <exception cref="HttpRequestException">The server returned a non-success status code</exception>
        /// <exception cref="InvalidOperationException">The response body deserialized to null</exception>
        public async Task<ClickByDescriptionResponse> ClickElementByDescriptionAsync(
            string elementDescription,
            string imageBase64 = null,
            string mechanism = null)
        {
            var request = new ClickByDescriptionRequest
            {
                TaskDescription = elementDescription,
                ImageBase64 = imageBase64,
                Mechanism = mechanism
            };

            using (var content = new StringContent(
                JsonSerializer.Serialize(request, s_jsonOptions),
                Encoding.UTF8,
                "application/json"))
            using (var response = await _httpClient.PostAsync("/tools-api/mouse/click-by-description", content))
            {
                response.EnsureSuccessStatusCode();

                var responseBody = await response.Content.ReadAsStringAsync();
                var result = JsonSerializer.Deserialize<ClickByDescriptionResponse>(responseBody, s_jsonOptions);
                if (result == null)
                {
                    // A literal JSON "null" body deserializes to null; surface it
                    // here instead of as a NullReferenceException in the caller.
                    throw new InvalidOperationException("The server returned an empty response.");
                }

                return result;
            }
        }
    }

    /// <summary>
    /// Request body for the click-by-description endpoint.
    /// </summary>
    public class ClickByDescriptionRequest
    {
        public string TaskDescription { get; set; }
        public string ImageBase64 { get; set; }
        public string Mechanism { get; set; }
    }

    /// <summary>
    /// Response body returned by the click-by-description endpoint.
    /// </summary>
    public class ClickByDescriptionResponse
    {
        public bool Success { get; set; }
        public string Message { get; set; }
        public DateTime Timestamp { get; set; }
    }

    // Example usage
    class Program
    {
        static async Task Main(string[] args)
        {
            var client = new ToolsServerClient("http://localhost:8080", "your_api_key_here");

            try
            {
                // Example: Find and click the File menu
                var result = await client.ClickElementByDescriptionAsync("the File menu");
                Console.WriteLine($"Success: {result.Success}");
                Console.WriteLine($"Message: {result.Message}");
                Console.WriteLine($"Timestamp: {result.Timestamp}");
            }
            catch (Exception ex)
            {
                Console.WriteLine($"API request failed: {ex.Message}");
            }
        }
    }
}
{
"taskDescription": "the File menu in the top left corner"
}
{
"taskDescription": "the blue Submit button"
}
{
"taskDescription": "the trash can icon in the toolbar"
}