Uses AI vision to find UI elements based on natural language descriptions and perform a drag and drop operation from one element to another. This endpoint leverages computer vision AI to identify interface elements from descriptive text, allowing more intuitive interaction with the computer.
Token Usage: Each invocation of this endpoint consumes 50-100 Smooth Operator API tokens, depending on the complexity of identifying both the source and target elements. Complex or ambiguous descriptions may require more tokens.
The request must include descriptions of both the source element to drag and the target destination.
{
  "startElementDescription": "string",  // Description of the UI element to drag from
  "endElementDescription": "string",    // Description of the destination element or location
  "imageBase64": "string",              // Optional base64-encoded screenshot image
  "mechanism": "string"                 // Optional mechanism to use for vision detection
}
Parameter | Type | Required | Description |
---|---|---|---|
startElementDescription | string | Yes | A natural language description of the UI element to drag from. For example, "the file icon", "the image", "the blue folder". |
endElementDescription | string | Yes | A natural language description of the destination element or location. For example, "the folder icon", "the trash can", "the document area". |
imageBase64 | string | No | Optional base64-encoded screenshot image. If not provided, the system will automatically take a screenshot. |
mechanism | string | No | Optional mechanism to use for vision detection. Currently, only "screengrasp2" is used regardless of the value provided in this parameter. |
The response indicates whether the drag operation was successful and provides details about the operation. If the operation fails, the message will indicate which element was not found.
{
  "success": boolean,     // Whether the operation was successful
  "message": "string",    // Result message or error description
  "timestamp": "string"   // ISO timestamp of the operation
}
Field | Type | Description |
---|---|---|
success | boolean | Indicates whether the operation was successful. |
message | string | A message describing the result of the operation. If successful, indicates the coordinates of the drag operation. If failed, indicates which element (start or end) could not be found. |
timestamp | string | ISO timestamp indicating when the operation was performed. |
{ "success": true, "message": "Dragged from (450, 300) to (700, 450)", "timestamp": "2023-09-15T14:32:10.123Z" }
import requests
import base64
import io

from PIL import ImageGrab


def drag_by_description(start_element_description, end_element_description,
                        image_base64=None, mechanism=None, timeout=30):
    """Perform a drag operation from one UI element to another using AI vision.

    Args:
        start_element_description (str): Description of the UI element to drag from.
        end_element_description (str): Description of the destination element or location.
        image_base64 (str, optional): Base64-encoded screenshot image.
            If None, a screenshot is captured automatically.
        mechanism (str, optional): Mechanism to use for vision detection.
        timeout (float, optional): Request timeout in seconds (default 30).
            Defaulted so existing call sites keep working unchanged.

    Returns:
        dict: Parsed JSON response with keys ``success``, ``message`` and ``timestamp``.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx response.
    """
    # Base URL and endpoint of the Tools Server
    base_url = "http://localhost:54321"
    endpoint = "/tools-api/mouse/drag-by-description"

    # If no image is provided, capture the screen and encode it as base64 PNG.
    if image_base64 is None:
        screenshot = ImageGrab.grab()
        buffer = io.BytesIO()
        screenshot.save(buffer, format="PNG")
        image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

    headers = {
        "Authorization": "Bearer YOUR_API_KEY",
    }
    payload = {
        "startElementDescription": start_element_description,
        "endElementDescription": end_element_description,
        "imageBase64": image_base64,
        "mechanism": mechanism,
    }

    # json= serializes the payload and sets the Content-Type header for us;
    # the timeout prevents the call from hanging forever on a dead server.
    response = requests.post(
        base_url + endpoint,
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    # Surface HTTP errors explicitly instead of failing later inside .json()
    # with a confusing JSONDecodeError on an error page.
    response.raise_for_status()
    return response.json()


# Example usage — guarded so importing this module does not fire a request.
if __name__ == "__main__":
    result = drag_by_description(
        "the file icon",
        "the folder icon",
        mechanism="screengrasp2",
    )
    print(f"Success: {result['success']}")
    print(f"Message: {result['message']}")
    print(f"Timestamp: {result['timestamp']}")
import axios from 'axios'; /** * Performs a drag operation from one element to another using AI vision * * @param startElementDescription - Description of the UI element to drag from * @param endElementDescription - Description of the destination element or location * @param imageBase64 - Optional base64-encoded screenshot image * @param mechanism - Optional mechanism to use for vision detection * @returns Promise with the response data */ async function dragByDescription( startElementDescription: string, endElementDescription: string, imageBase64?: string, mechanism?: string ): Promise<{ success: boolean; message: string; timestamp: string; }> { // Base URL of the Tools Server const baseUrl = 'http://localhost:54321'; // Endpoint const endpoint = '/tools-api/mouse/drag-by-description'; // Request headers const headers = { 'Content-Type': 'application/json', 'Authorization': 'Bearer YOUR_API_KEY' }; // Request payload const payload = { startElementDescription, endElementDescription, imageBase64, // If undefined, server will take a screenshot mechanism }; try { // Send the request const response = await axios.post( baseUrl + endpoint, payload, { headers } ); // Return the response data return response.data; } catch (error) { // Handle any errors console.error('Error performing drag operation:', error); throw error; } } // Example usage async function example() { try { const result = await dragByDescription( 'the file icon', 'the folder icon', undefined, 'screengrasp2' ); console.log(`Success: ${result.success}`); console.log(`Message: ${result.message}`); console.log(`Timestamp: ${result.timestamp}`); } catch (error) { console.error('Failed to perform drag operation:', error); } } example();
using System;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

public class ToolsServerClient
{
    private readonly HttpClient _httpClient;
    private readonly string _baseUrl;
    private readonly string _apiKey;

    // The API uses camelCase property names on the wire ("startElementDescription",
    // "success", ...). System.Text.Json serializes PascalCase and deserializes
    // case-sensitively by default, so without these options the request properties
    // would not match and every response field would come back as its default value.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    public ToolsServerClient(string baseUrl = "http://localhost:54321", string apiKey = "YOUR_API_KEY")
    {
        _baseUrl = baseUrl;
        _apiKey = apiKey;
        _httpClient = new HttpClient();
        _httpClient.DefaultRequestHeaders.Add("Authorization", $"Bearer {_apiKey}");
    }

    /// <summary>
    /// Performs a drag operation from one element to another using AI vision.
    /// </summary>
    /// <param name="startElementDescription">Description of the UI element to drag from</param>
    /// <param name="endElementDescription">Description of the destination element or location</param>
    /// <param name="imageBase64">Optional base64-encoded screenshot image</param>
    /// <param name="mechanism">Optional mechanism to use for vision detection</param>
    /// <returns>Action response indicating success or failure</returns>
    public async Task<ActionResponse> DragByDescriptionAsync(
        string startElementDescription,
        string endElementDescription,
        string imageBase64 = null,
        string mechanism = null)
    {
        // Endpoint
        var endpoint = "/tools-api/mouse/drag-by-description";

        // Create request object
        var request = new DragByDescriptionRequest
        {
            StartElementDescription = startElementDescription,
            EndElementDescription = endElementDescription,
            ImageBase64 = imageBase64, // If null, server will take a screenshot
            Mechanism = mechanism
        };

        // Serialize request to camelCase JSON as the API expects
        var jsonContent = JsonSerializer.Serialize(request, JsonOptions);
        var content = new StringContent(jsonContent, Encoding.UTF8, "application/json");

        // Send the request
        var response = await _httpClient.PostAsync(_baseUrl + endpoint, content);

        // Ensure success status code
        response.EnsureSuccessStatusCode();

        // Deserialize the response
        var jsonResponse = await response.Content.ReadAsStringAsync();
        return JsonSerializer.Deserialize<ActionResponse>(jsonResponse, JsonOptions);
    }
}

// Request and response models
public class DragByDescriptionRequest
{
    public string StartElementDescription { get; set; }
    public string EndElementDescription { get; set; }
    public string ImageBase64 { get; set; }
    public string Mechanism { get; set; }
}

public class ActionResponse
{
    public bool Success { get; set; }
    public string Message { get; set; }
    public DateTime Timestamp { get; set; }
}

// Example usage
class Program
{
    static async Task Main()
    {
        var client = new ToolsServerClient();
        try
        {
            var result = await client.DragByDescriptionAsync(
                "the file icon",
                "the folder icon",
                mechanism: "screengrasp2"
            );
            Console.WriteLine($"Success: {result.Success}");
            Console.WriteLine($"Message: {result.Message}");
            Console.WriteLine($"Timestamp: {result.Timestamp}");
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
If you already know the exact screen coordinates, use the /tools-api/mouse/drag endpoint instead.

{
"startElementDescription": "the Excel spreadsheet file on the desktop",
"endElementDescription": "the Documents folder"
}
{
"startElementDescription": "the blue circle in the top-left corner of the canvas",
"endElementDescription": "the center of the design area"
}
{
"startElementDescription": "the red button labeled 'Delete'",
"endElementDescription": "the trash bin icon at the bottom right",
"imageBase64": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."
}