This endpoint uses AI vision to find a UI element on the screen based on its description and performs a right-click action at that location. The endpoint will take a screenshot if one is not provided, analyze the image to find the described element, and then perform the right-click.
Token Usage: This endpoint consumes 50-100 Smooth Operator API tokens per invocation. The exact token cost depends on the complexity of the element identification process. Clear, unique element descriptions typically result in more efficient token usage.
The request requires a description of the UI element you want to right-click. Optionally, you can also provide a base64-encoded screenshot to analyze. If not provided, the server will take a screenshot automatically.
{
  "taskDescription": "string", // Description of the UI element to find and right-click
  "imageBase64": "string",     // Optional base64-encoded screenshot image
  "mechanism": "string"        // Optional mechanism to use for vision detection
}
Parameter | Type | Required | Description |
---|---|---|---|
taskDescription | string | Yes | Description of the UI element to find and right-click (e.g., "the icon of the file ReadMe.txt") |
imageBase64 | string | No | Optional base64-encoded screenshot image. If not provided, the system will automatically take a screenshot. |
mechanism | string | No | Optional mechanism to use for vision detection. Available options: "screengrasp2" (default), "screengrasp2-low", "screengrasp2-medium", "screengrasp2-high", "llabs", "anthropic-computer-use", "openai-computer-use", "qwen25-vl-72b". See Screengrasp documentation for more details. |
{
"taskDescription": "the file icon",
"mechanism": "screengrasp2"
}
The response includes information about whether the right-click action was successful and details about the execution.
{
  "Success": boolean,    // Whether the operation was successful
  "Message": "string",   // Result message or error description
  "Timestamp": "string"  // ISO timestamp of the operation
}
Field | Type | Description |
---|---|---|
Success | boolean | Indicates whether the right-click action was executed successfully |
Message | string | Description of the result or error message if failed |
Timestamp | string | ISO 8601 timestamp of when the action was executed |
{
"Success": true,
"Message": "Right-click action executed successfully at coordinates (542, 387)",
"Timestamp": "2025-03-12T09:35:42.123Z"
}
import requests
import json
import base64
from PIL import ImageGrab
import io


def right_click_element_by_description(api_url, api_key, element_description,
                                       use_screenshot=False, mechanism=None,
                                       timeout=60):
    """
    Right-clicks an element on the screen based on its description using AI vision.

    Args:
        api_url: Base URL of the Smooth Operator Tools API
        api_key: API key for authentication
        element_description: Text description of the element to right-click
        use_screenshot: Whether to capture and send a screenshot (default: False)
        mechanism: Optional vision mechanism name (e.g. "screengrasp2"). When
            None, the field is omitted and the server picks its default.
        timeout: Seconds to wait for the server before giving up. Pass None
            to wait indefinitely.

    Returns:
        The API response as a dictionary

    Raises:
        requests.HTTPError: If the server answers with a non-2xx status code.
        requests.RequestException: On connection problems or timeout.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    request_data = {
        "taskDescription": element_description
    }

    # Only send the mechanism when the caller asked for a specific one.
    if mechanism is not None:
        request_data["mechanism"] = mechanism

    # Optionally capture and include a screenshot
    if use_screenshot:
        # Capture the entire screen
        screenshot = ImageGrab.grab()

        # Convert the image to base64
        buffer = io.BytesIO()
        screenshot.save(buffer, format="PNG")
        img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')

        # Add the base64 image to the request.
        # NOTE(review): the value is sent as a data URI; confirm the server
        # accepts the "data:image/png;base64," prefix.
        request_data["imageBase64"] = f"data:image/png;base64,{img_str}"

    response = requests.post(
        # Tolerate a trailing slash on the base URL to avoid a "//" path.
        f"{api_url.rstrip('/')}/tools-api/mouse/rightclick-by-description",
        headers=headers,
        data=json.dumps(request_data),
        # Without a timeout, requests waits forever on a stalled server.
        timeout=timeout
    )

    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    return response.json()


# Example usage
if __name__ == "__main__":
    API_URL = "http://localhost:8080"
    API_KEY = "your_api_key_here"

    # Example: Right-click on a file icon
    result = right_click_element_by_description(
        API_URL,
        API_KEY,
        "the icon of the file ReadMe.txt"
    )
    print(json.dumps(result, indent=2))
interface RightClickByDescriptionRequest {
  taskDescription: string;
  imageBase64?: string;
  mechanism?: string;
}

interface ActionResponse {
  success: boolean;
  message: string;
  timestamp: string;
}

/**
 * Right-clicks an element on the screen based on its description using AI vision
 *
 * @param apiUrl - Base URL of the Smooth Operator Tools API
 * @param apiKey - API key for authentication
 * @param elementDescription - Description of the element to right-click
 * @param options - Optional extra request fields (`imageBase64`, `mechanism`)
 * @returns Promise with the action response
 * @throws Error when the server answers with a non-2xx status
 */
async function rightClickElementByDescription(
  apiUrl: string,
  apiKey: string,
  elementDescription: string,
  options: Omit<RightClickByDescriptionRequest, 'taskDescription'> = {}
): Promise<ActionResponse> {
  const endpoint = `${apiUrl}/tools-api/mouse/rightclick-by-description`;

  const requestData: RightClickByDescriptionRequest = {
    ...options,
    taskDescription: elementDescription,
    // Note: This example does not capture a screenshot.
    // In a browser environment, you could use the HTML5 Canvas API to capture a screenshot.
  };

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`
    },
    body: JSON.stringify(requestData)
  });

  if (!response.ok) {
    throw new Error(`API request failed: ${response.statusText}`);
  }

  // The server returns PascalCase keys ("Success", "Message", "Timestamp").
  // Map them onto the camelCase ActionResponse so `result.success` is not
  // silently undefined; camelCase keys are accepted as a fallback.
  const payload = (await response.json()) as Record<string, unknown>;

  return {
    success: Boolean(payload.Success ?? payload.success),
    message: String(payload.Message ?? payload.message ?? ''),
    timestamp: String(payload.Timestamp ?? payload.timestamp ?? '')
  };
}

// Example usage
async function example() {
  const API_URL = 'http://localhost:8080';
  const API_KEY = 'your_api_key_here';

  try {
    const result = await rightClickElementByDescription(
      API_URL,
      API_KEY,
      'the icon of the file ReadMe.txt'
    );

    console.log('Right-click result:', result);

    if (result.success) {
      console.log('Right-click action completed successfully');
    } else {
      console.error('Failed to right-click element:', result.message);
    }
  } catch (error) {
    console.error('Error executing right-click by description:', error);
  }
}
using System;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

/// <summary>
/// Minimal client for the Smooth Operator Tools API right-click-by-description endpoint.
/// </summary>
public class ToolsServerClient : IDisposable
{
    private const string DefaultBaseUrl = "http://localhost:8080";

    // Serializer options are cached: building them on every call is wasteful.
    private static readonly JsonSerializerOptions s_requestOptions = new JsonSerializerOptions
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    // The response uses PascalCase keys; case-insensitive matching maps them
    // onto the DTO properties regardless of casing.
    private static readonly JsonSerializerOptions s_responseOptions = new JsonSerializerOptions
    {
        PropertyNameCaseInsensitive = true
    };

    private readonly HttpClient _httpClient;
    private readonly string _apiKey;
    private readonly string _baseUrl;

    /// <summary>
    /// Creates a client that authenticates with the given API key.
    /// </summary>
    /// <param name="apiKey">API key sent as a Bearer token on every request.</param>
    /// <param name="baseUrl">Base URL of the Tools API server; defaults to http://localhost:8080.</param>
    public ToolsServerClient(string apiKey, string baseUrl = DefaultBaseUrl)
    {
        _httpClient = new HttpClient();
        _apiKey = apiKey;
        // Strip a trailing slash so endpoint concatenation never yields "//".
        _baseUrl = (baseUrl ?? DefaultBaseUrl).TrimEnd('/');
        _httpClient.DefaultRequestHeaders.Authorization =
            new AuthenticationHeaderValue("Bearer", _apiKey);
    }

    /// <summary>
    /// Right-clicks an element on the screen based on its description using AI vision
    /// </summary>
    /// <param name="elementDescription">Description of the element to right-click</param>
    /// <returns>Response indicating success or failure</returns>
    public async Task<ActionResponse> RightClickElementByDescriptionAsync(string elementDescription)
    {
        var request = new RightClickByDescriptionRequest
        {
            TaskDescription = elementDescription,
            // Note: This example does not include a screenshot.
            // You could capture one using System.Drawing or other libraries.
        };

        return await SendRequestAsync<RightClickByDescriptionRequest, ActionResponse>(
            "/tools-api/mouse/rightclick-by-description", request);
    }

    /// <summary>
    /// Releases the underlying <see cref="HttpClient"/>.
    /// </summary>
    public void Dispose()
    {
        _httpClient.Dispose();
    }

    /// <summary>
    /// Posts <paramref name="requestData"/> as camelCase JSON to <paramref name="endpoint"/>
    /// and deserializes the JSON response body.
    /// </summary>
    /// <exception cref="HttpRequestException">The server returned a non-success status code.</exception>
    /// <exception cref="InvalidOperationException">The response body deserialized to null.</exception>
    private async Task<TResponse> SendRequestAsync<TRequest, TResponse>(string endpoint, TRequest requestData)
        where TResponse : class
    {
        var json = JsonSerializer.Serialize(requestData, s_requestOptions);

        using (var content = new StringContent(json, Encoding.UTF8, "application/json"))
        using (var response = await _httpClient.PostAsync($"{_baseUrl}{endpoint}", content))
        {
            response.EnsureSuccessStatusCode();

            var responseJson = await response.Content.ReadAsStringAsync();
            var result = JsonSerializer.Deserialize<TResponse>(responseJson, s_responseOptions);

            // A literal "null" body deserializes to null; report it here
            // rather than letting callers hit a NullReferenceException.
            if (result == null)
            {
                throw new InvalidOperationException("The server returned an empty response body.");
            }

            return result;
        }
    }
}

/// <summary>
/// Request body for the right-click-by-description endpoint.
/// </summary>
public class RightClickByDescriptionRequest
{
    public string TaskDescription { get; set; }
    public string ImageBase64 { get; set; }
    public string Mechanism { get; set; }
}

/// <summary>
/// Result returned by the server after an action was attempted.
/// </summary>
public class ActionResponse
{
    public bool Success { get; set; }
    public string Message { get; set; }
    public DateTime Timestamp { get; set; }
}

// Example usage
public class Program
{
    public static async Task Main()
    {
        using (var client = new ToolsServerClient("your_api_key_here"))
        {
            try
            {
                var result = await client.RightClickElementByDescriptionAsync("the icon of the file ReadMe.txt");
                Console.WriteLine($"Success: {result.Success}");
                Console.WriteLine($"Message: {result.Message}");
                Console.WriteLine($"Timestamp: {result.Timestamp}");
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Error: {ex.Message}");
            }
        }
    }
}