This endpoint allows you to find UI elements on the screen using natural language descriptions. It takes a screenshot of the current screen (if one is not provided) and uses AI vision to identify UI elements matching your description. This is particularly useful for automating interactions with applications where traditional element selectors are not available or reliable.
{ "taskDescription": "string", // Natural language description of the UI element to find "imageBase64": "string", // Optional - Base64 encoded screenshot image (if not provided, a screenshot will be taken) "mechanism": "string" // Optional - Vision mechanism to use (screengrasp2, screengrasp2-low, screengrasp2-medium, screengrasp2-high, llabs, anthropic-computer-use, openai-computer-use, qwen25-vl-72b) }
Parameter | Type | Required | Description |
---|---|---|---|
taskDescription | string | Yes | A natural language description of the UI element you want to find. For example: "red button", "login button", "username input field", etc. |
imageBase64 | string | No | Base64 encoded image of the screen. If not provided, a screenshot will be automatically taken. |
mechanism | string | No | Vision mechanism to use for finding UI elements. Possible values: "screengrasp2", "screengrasp2-low", "screengrasp2-medium", "screengrasp2-high", "llabs", "anthropic-computer-use", "openai-computer-use", "qwen25-vl-72b". Default is "screengrasp2". Learn more about these mechanisms. |
{ "success": boolean, // Whether the operation was successful "message": "string", // Additional information or error message "timestamp": "string", // ISO 8601 timestamp of when the response was generated "x": number, // X coordinate of the found UI element (may be null if not found) "y": number, // Y coordinate of the found UI element (may be null if not found) "status": "string", // Status message regarding the element detection "imageBase64": "string" // Base64 encoded screenshot image that was analyzed }
Field | Type | Description |
---|---|---|
success | boolean | Indicates whether the operation was successful. |
message | string | Additional information about the operation, or an error message if the operation failed. |
timestamp | string | ISO 8601 timestamp of when the response was generated. |
x | number \| null | The X coordinate of the found UI element. Will be null if no element was found. |
y | number \| null | The Y coordinate of the found UI element. Will be null if no element was found. |
status | string | Status message providing additional information about the element detection process. |
imageBase64 | string | Base64 encoded image of the screenshot that was analyzed. |
import requests
import json
import base64
from PIL import Image
import io
import matplotlib.pyplot as plt
import matplotlib.patches as patches


def find_ui_element(description, image_base64=None, mechanism="screengrasp2"):
    """Find UI elements on the screen using a natural language description.

    Args:
        description (str): Natural language description of the UI element to
            find, e.g. "search box" or "login button".
        image_base64 (str, optional): Base64 encoded screenshot image. If not
            provided, the server takes a screenshot itself.
        mechanism (str, optional): Vision mechanism to use (screengrasp2,
            screengrasp2-low, screengrasp2-medium, screengrasp2-high, llabs,
            anthropic-computer-use, openai-computer-use, qwen25-vl-72b).

    Returns:
        dict: The parsed JSON response from the server.

    Raises:
        requests.HTTPError: If the server returns a non-2xx status code.
    """
    url = "http://localhost:5000/tools-api/screenshot/find-ui-element"
    data = {
        "taskDescription": description,
        "mechanism": mechanism,
    }
    if image_base64:
        data["imageBase64"] = image_base64
    # json= serializes the payload and sets the Content-Type header in one step,
    # replacing the manual headers + json.dumps combination.
    response = requests.post(url, json=data)
    # Fail fast on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()


def visualize_results(response):
    """Display the analyzed screenshot with a marker on the found element.

    Args:
        response (dict): The response from find_ui_element.
    """
    if not response.get("success") or "imageBase64" not in response:
        print("No screenshot in response or operation failed")
        return

    # Decode the screenshot that the server analyzed.
    img_data = base64.b64decode(response["imageBase64"])
    img = Image.open(io.BytesIO(img_data))

    fig, ax = plt.subplots(1)
    ax.imshow(img)

    x, y = response.get("x"), response.get("y")
    if x is not None and y is not None:
        # Center the 10x10 marker on the reported coordinate: (x, y) is the
        # element's point, not the top-left corner of a bounding box.
        rect = patches.Rectangle(
            (x - 5, y - 5), 10, 10,
            linewidth=2, edgecolor='r', facecolor='none'
        )
        ax.add_patch(rect)

    plt.title("Found UI element")
    plt.axis('off')
    plt.show()


# Example usage
if __name__ == "__main__":
    # Find a search box on the screen.
    result = find_ui_element(
        description="search box",
        mechanism="screengrasp2-medium",
    )

    # Use .get so a failure response that omits fields doesn't raise KeyError.
    print(f"Success: {result.get('success')}")
    print(f"Message: {result.get('message')}")
    print(f"Timestamp: {result.get('timestamp')}")
    print(f"X: {result.get('x')}")
    print(f"Y: {result.get('y')}")
    print(f"Status: {result.get('status')}")

    # Visualize the result if a screenshot came back.
    if result.get("success") and "imageBase64" in result:
        visualize_results(result)
interface FindUiElementRequest { taskDescription: string; imageBase64?: string; mechanism?: string; } interface FindUiElementResponse { success: boolean; message: string; timestamp: string; x: number | null; y: number | null; status: string; imageBase64: string; } /** * Find UI elements on the screen using a natural language description. * * @param description - Natural language description of the UI element to find * @param options - Additional options for the search * @returns A promise that resolves to the find UI element response */ async function findUiElement( description: string, options: { imageBase64?: string; mechanism?: string; } = {} ): Promise{ const url = 'http://localhost:5000/tools-api/screenshot/find-ui-element'; const headers: Record = { 'Content-Type': 'application/json' }; const request: FindUiElementRequest = { taskDescription: description, ...options }; const response = await fetch(url, { method: 'POST', headers, body: JSON.stringify(request) }); return await response.json(); } /** * Generate HTML to visualize the elements found by drawing rectangles on the screenshot. * * @param response - The response from the findUiElement function * @returns HTML string with the visualization */ function generateVisualization(response: FindUiElementResponse): string { if (!response.success || !response.imageBase64) { return ' No screenshot in response or operation failed
'; } const containerStyle = 'position: relative; max-width: 100%; overflow: hidden;'; let html = ``; // Add the screenshot as background html += `'; return html; } // Example usage (async () => { try { // Find buttons on the screen const result = await findUiElement( 'submit button', { mechanism: 'screengrasp2-medium' } ); // Print basic information console.log(`Success: ${result.success}`); console.log(`Message: ${result.message}`); console.log(`Timestamp: ${result.timestamp}`); console.log(`X: ${result.x}`); console.log(`Y: ${result.y}`); console.log(`Status: ${result.status}`); // Create visualization if (result.success && result.imageBase64) { const visualization = generateVisualization(result); // In a browser environment, you could add this to the DOM: // document.getElementById('visualization').innerHTML = visualization; console.log('Visualization HTML generated'); } } catch (error) { console.error('Error finding UI element:', error); } })();`; // Add rectangles for each element if (response.x !== null && response.y !== null) { const rectStyle = 'position: absolute; left: ' + response.x + 'px; top: ' + response.y + 'px; width: 10px; height: 10px; border: 2px solid red; background-color: rgba(255, 0, 0, 0.3);'; html += ``; } html += '
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

/// <summary>Request payload for the find-ui-element endpoint.</summary>
public class FindUiElementRequest
{
    public string TaskDescription { get; set; }
    public string ImageBase64 { get; set; }
    public string Mechanism { get; set; }
}

/// <summary>Response payload from the find-ui-element endpoint.</summary>
public class FindUiElementResponse
{
    public bool Success { get; set; }
    public string Message { get; set; }
    public string Timestamp { get; set; }
    // Null when no element was found.
    public int? X { get; set; }
    public int? Y { get; set; }
    public string Status { get; set; }
    public string ImageBase64 { get; set; }
}

public class ToolsServerClient
{
    private readonly HttpClient _httpClient;

    public ToolsServerClient()
    {
        _httpClient = new HttpClient();
        _httpClient.BaseAddress = new Uri("http://localhost:5000/");
    }

    /// <summary>
    /// Find UI elements on the screen using a natural language description.
    /// </summary>
    /// <param name="description">Natural language description of the UI element to find</param>
    /// <param name="imageBase64">Base64 encoded screenshot image (if not provided, a screenshot will be taken)</param>
    /// <param name="mechanism">Vision mechanism to use (screengrasp2, screengrasp2-low, screengrasp2-medium, screengrasp2-high, llabs, anthropic-computer-use, openai-computer-use, qwen25-vl-72b)</param>
    /// <returns>Response containing the found UI element</returns>
    public async Task<FindUiElementResponse> FindUiElementAsync(
        string description,
        string imageBase64 = null,
        string mechanism = "screengrasp2")
    {
        var request = new FindUiElementRequest
        {
            TaskDescription = description,
            ImageBase64 = imageBase64,
            Mechanism = mechanism
        };

        var json = JsonSerializer.Serialize(request);
        var content = new StringContent(json, Encoding.UTF8, "application/json");

        var response = await _httpClient.PostAsync("tools-api/screenshot/find-ui-element", content);
        response.EnsureSuccessStatusCode();

        var responseBody = await response.Content.ReadAsStringAsync();
        // Case-insensitive matching maps the server's camelCase JSON onto the
        // PascalCase C# properties.
        return JsonSerializer.Deserialize<FindUiElementResponse>(
            responseBody,
            new JsonSerializerOptions { PropertyNameCaseInsensitive = true });
    }

    /// <summary>
    /// Save the screenshot from the response to a file and generate an image
    /// with a rectangle around the found UI element.
    /// </summary>
    /// <param name="response">The response from FindUiElementAsync</param>
    /// <param name="outputPath">Path to save the visualization image</param>
    /// <returns>Path to the saved image</returns>
    public string SaveVisualization(FindUiElementResponse response, string outputPath)
    {
        if (!response.Success || string.IsNullOrEmpty(response.ImageBase64))
        {
            throw new InvalidOperationException("No screenshot in response or operation failed");
        }

        // Convert base64 to image.
        var imageBytes = Convert.FromBase64String(response.ImageBase64);
        using var ms = new MemoryStream(imageBytes);
        using var originalImage = Image.FromStream(ms);

        // Create a copy to draw on.
        using var image = new Bitmap(originalImage);
        using var graphics = Graphics.FromImage(image);

        // Draw a rectangle at the found coordinate, if any.
        using var pen = new Pen(Color.Red, 2);
        if (response.X.HasValue && response.Y.HasValue)
        {
            graphics.DrawRectangle(pen, response.X.Value, response.Y.Value, 10, 10);
        }

        // Save the image.
        image.Save(outputPath);
        return outputPath;
    }
}

// Example usage
class Program
{
    static async Task Main(string[] args)
    {
        var client = new ToolsServerClient();

        try
        {
            // Find a login button on the screen.
            var result = await client.FindUiElementAsync(
                description: "login button",
                mechanism: "screengrasp2-medium"
            );

            // Print basic information.
            Console.WriteLine($"Success: {result.Success}");
            Console.WriteLine($"Message: {result.Message}");
            Console.WriteLine($"Timestamp: {result.Timestamp}");
            Console.WriteLine($"X: {result.X}");
            Console.WriteLine($"Y: {result.Y}");
            Console.WriteLine($"Status: {result.Status}");

            // Save the visualization.
            if (result.Success && !string.IsNullOrEmpty(result.ImageBase64))
            {
                var outputPath = Path.Combine(
                    Environment.GetFolderPath(Environment.SpecialFolder.Desktop),
                    "ui_elements.png");
                client.SaveVisualization(result, outputPath);
                Console.WriteLine($"Visualization saved to: {outputPath}");
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
This example searches for a button with the text "Submit" on the screen:
// Request POST /tools-api/screenshot/find-ui-element { "taskDescription": "submit button", "mechanism": "screengrasp2-medium" } // Response { "success": true, "message": "Found UI element", "timestamp": "2023-03-09T14:30:00.000Z", "x": 850, "y": 520, "status": "Element found", "imageBase64": "base64_encoded_screenshot..." }
This example searches for a username input field on the screen:
// Request POST /tools-api/screenshot/find-ui-element { "taskDescription": "username input field", "imageBase64": "base64_encoded_screenshot..." } // Response { "success": true, "message": "Found UI element", "timestamp": "2023-03-09T14:30:00.000Z", "x": 480, "y": 320, "status": "Element found", "imageBase64": "base64_encoded_screenshot..." }
This example uses the "llabs" vision mechanism together with a provided screenshot to find a settings icon:
// Request POST /tools-api/screenshot/find-ui-element { "taskDescription": "settings icon", "imageBase64": "base64_encoded_screenshot...", "mechanism": "llabs" } // Response { "success": true, "message": "Found UI element", "timestamp": "2023-03-09T14:30:00.000Z", "x": 250, "y": 25, "status": "Element found", "imageBase64": "base64_encoded_screenshot..." }