This endpoint allows you to find UI elements on the screen using natural language descriptions. It takes a screenshot of the current screen (if one is not provided) and uses AI vision to identify UI elements matching your description. This is particularly useful for automating interactions with applications where traditional element selectors are not available or reliable.
{
"taskDescription": "string", // Natural language description of the UI element to find
"imageBase64": "string", // Optional - Base64 encoded screenshot image (if not provided, a screenshot will be taken)
"mechanism": "string" // Optional - Vision mechanism to use (screengrasp2, screengrasp2-low, screengrasp2-medium, screengrasp2-high, llabs, anthropic-computer-use, openai-computer-use, qwen25-vl-72b)
}
| Parameter | Type | Required | Description |
|---|---|---|---|
| taskDescription | string | Yes | A natural language description of the UI element you want to find. For example: "red button", "login button", "username input field", etc. |
| imageBase64 | string | No | Base64 encoded image of the screen. If not provided, a screenshot will be automatically taken. |
| mechanism | string | No | Vision mechanism to use for finding UI elements. Possible values: "screengrasp2", "screengrasp2-low", "screengrasp2-medium", "screengrasp2-high", "llabs", "anthropic-computer-use", "openai-computer-use", "qwen25-vl-72b". Default is "screengrasp2". Learn more about these mechanisms. |
{
"success": boolean, // Whether the operation was successful
"message": "string", // Additional information or error message
"timestamp": "string", // ISO 8601 timestamp of when the response was generated
"x": number, // X coordinate of the found UI element (may be null if not found)
"y": number, // Y coordinate of the found UI element (may be null if not found)
"status": "string", // Status message regarding the element detection
"imageBase64": "string" // Base64 encoded screenshot image that was analyzed
}
| Field | Type | Description |
|---|---|---|
| success | boolean | Indicates whether the operation was successful. |
| message | string | Additional information about the operation, or an error message if the operation failed. |
| timestamp | string | ISO 8601 timestamp of when the response was generated. |
| x | number or null | The X coordinate of the found UI element. Will be null if no element was found. |
| y | number or null | The Y coordinate of the found UI element. Will be null if no element was found. |
| status | string | Status message providing additional information about the element detection process. |
| imageBase64 | string | Base64 encoded image of the screenshot that was analyzed. |
import requests
import json
import base64
from PIL import Image
import io
import matplotlib.pyplot as plt
import matplotlib.patches as patches
def find_ui_element(description, image_base64=None, mechanism="screengrasp2"):
    """
    Find a UI element on the screen using a natural language description.

    Args:
        description (str): Natural language description of the UI element to
            find, e.g. "red button" or "username input field".
        image_base64 (str, optional): Base64 encoded screenshot image. If not
            provided, the server takes a screenshot itself.
        mechanism (str, optional): Vision mechanism to use (screengrasp2,
            screengrasp2-low, screengrasp2-medium, screengrasp2-high, llabs,
            anthropic-computer-use, openai-computer-use, qwen25-vl-72b).

    Returns:
        dict: The parsed JSON response from the server.

    Raises:
        requests.HTTPError: If the server returns an error status code.
    """
    url = "http://localhost:5000/tools-api/screenshot/find-ui-element"
    payload = {
        "taskDescription": description,
        "mechanism": mechanism,
    }
    # Only send the screenshot key when the caller actually supplied one,
    # so the server falls back to taking its own screenshot otherwise.
    if image_base64:
        payload["imageBase64"] = image_base64
    # json= serializes the body and sets the Content-Type header for us.
    response = requests.post(url, json=payload)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
def visualize_results(response, marker_size=10):
    """
    Visualize the detection result by drawing a rectangle at the reported
    coordinates on top of the analyzed screenshot.

    Args:
        response (dict): The response from the find_ui_element function.
        marker_size (int, optional): Side length in pixels of the square
            marker drawn around the found element. Defaults to 10.
    """
    if not response.get("success") or "imageBase64" not in response:
        print("No screenshot in response or operation failed")
        return

    # Decode the base64 screenshot into a PIL image.
    img_data = base64.b64decode(response["imageBase64"])
    img = Image.open(io.BytesIO(img_data))

    fig, ax = plt.subplots(1)
    ax.imshow(img)

    # (x, y) is the reported element position; center the marker on that
    # point rather than hanging the box off its top-left corner.
    if response.get("x") is not None and response.get("y") is not None:
        half = marker_size / 2
        rect = patches.Rectangle(
            (response["x"] - half, response["y"] - half),
            marker_size,
            marker_size,
            linewidth=2,
            edgecolor='r',
            facecolor='none'
        )
        ax.add_patch(rect)

    plt.title("Found UI element")
    plt.axis('off')
    plt.show()
# Example usage
if __name__ == "__main__":
    # Locate a search box anywhere on the current screen.
    outcome = find_ui_element(
        description="search box",
        mechanism="screengrasp2-medium",
    )

    # Report the key response fields.
    print(f"Success: {outcome['success']}")
    print(f"Message: {outcome['message']}")
    print(f"Timestamp: {outcome['timestamp']}")
    print(f"X: {outcome['x']}")
    print(f"Y: {outcome['y']}")
    print(f"Status: {outcome['status']}")

    # Show the annotated screenshot when one came back.
    if outcome["success"] and "imageBase64" in outcome:
        visualize_results(outcome)
/** Request body for POST /tools-api/screenshot/find-ui-element. */
interface FindUiElementRequest {
/** Natural language description of the UI element to find (required). */
taskDescription: string;
/** Base64 encoded screenshot; when omitted the server takes one itself. */
imageBase64?: string;
/** Vision mechanism to use; the docs list "screengrasp2" as the default. */
mechanism?: string;
}
/** Response body returned by the find-ui-element endpoint. */
interface FindUiElementResponse {
/** Whether the operation was successful. */
success: boolean;
/** Additional information, or an error message on failure. */
message: string;
/** ISO 8601 timestamp of when the response was generated. */
timestamp: string;
/** X coordinate of the found element; null when nothing was found. */
x: number | null;
/** Y coordinate of the found element; null when nothing was found. */
y: number | null;
/** Status message about the element detection process. */
status: string;
/** Base64 encoded screenshot that was analyzed. */
imageBase64: string;
}
/**
* Find UI elements on the screen using a natural language description.
*
* @param description - Natural language description of the UI element to find
* @param options - Additional options for the search
* @returns A promise that resolves to the find UI element response
*/
async function findUiElement(
description: string,
options: {
imageBase64?: string;
mechanism?: string;
} = {}
): Promise {
const url = 'http://localhost:5000/tools-api/screenshot/find-ui-element';
const headers: Record = {
'Content-Type': 'application/json'
};
const request: FindUiElementRequest = {
taskDescription: description,
...options
};
const response = await fetch(url, {
method: 'POST',
headers,
body: JSON.stringify(request)
});
return await response.json();
}
/**
* Generate HTML to visualize the elements found by drawing rectangles on the screenshot.
*
* @param response - The response from the findUiElement function
* @returns HTML string with the visualization
*/
function generateVisualization(response: FindUiElementResponse): string {
if (!response.success || !response.imageBase64) {
return 'No screenshot in response or operation failed
';
}
const containerStyle = 'position: relative; max-width: 100%; overflow: hidden;';
let html = '';
// Add the screenshot as background
html += '
';
// Add rectangles for each element
if (response.x !== null && response.y !== null) {
const rectStyle = 'position: absolute; left: ' + response.x + 'px; top: ' + response.y +
'px; width: 10px; height: 10px; border: 2px solid red; background-color: rgba(255, 0, 0, 0.3);';
html += '';
}
html += '';
return html;
}
// Example usage
(async () => {
  try {
    // Locate the submit button on the current screen.
    const result = await findUiElement('submit button', {
      mechanism: 'screengrasp2-medium'
    });

    // Dump the key response fields.
    console.log(`Success: ${result.success}`);
    console.log(`Message: ${result.message}`);
    console.log(`Timestamp: ${result.timestamp}`);
    console.log(`X: ${result.x}`);
    console.log(`Y: ${result.y}`);
    console.log(`Status: ${result.status}`);

    // Build the overlay markup when a screenshot came back.
    if (result.success && result.imageBase64) {
      const visualization = generateVisualization(result);
      // In a browser environment, you could add this to the DOM:
      // document.getElementById('visualization').innerHTML = visualization;
      console.log('Visualization HTML generated');
    }
  } catch (error) {
    console.error('Error finding UI element:', error);
  }
})();
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
/// <summary>Request body for the find-ui-element endpoint.</summary>
public class FindUiElementRequest
{
/// <summary>Natural language description of the UI element to find (required).</summary>
public string TaskDescription { get; set; }
/// <summary>Base64 encoded screenshot; when null the server takes one itself.</summary>
public string ImageBase64 { get; set; }
/// <summary>Vision mechanism to use; the docs list "screengrasp2" as the default.</summary>
public string Mechanism { get; set; }
}
/// <summary>Response body returned by the find-ui-element endpoint.</summary>
public class FindUiElementResponse
{
/// <summary>Whether the operation was successful.</summary>
public bool Success { get; set; }
/// <summary>Additional information, or an error message on failure.</summary>
public string Message { get; set; }
/// <summary>ISO 8601 timestamp of when the response was generated.</summary>
public string Timestamp { get; set; }
/// <summary>X coordinate of the found element; null when nothing was found.</summary>
public int? X { get; set; }
/// <summary>Y coordinate of the found element; null when nothing was found.</summary>
public int? Y { get; set; }
/// <summary>Status message about the element detection process.</summary>
public string Status { get; set; }
/// <summary>Base64 encoded screenshot that was analyzed.</summary>
public string ImageBase64 { get; set; }
}
public class ToolsServerClient
{
    private readonly HttpClient _httpClient;

    // Serializer settings matching the documented wire contract:
    // camelCase property names on the wire, case-insensitive on read.
    private static readonly JsonSerializerOptions _jsonOptions = new JsonSerializerOptions
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        PropertyNameCaseInsensitive = true
    };

    public ToolsServerClient()
    {
        _httpClient = new HttpClient();
        _httpClient.BaseAddress = new Uri("http://localhost:5000/");
    }

    /// <summary>
    /// Find a UI element on the screen using a natural language description.
    /// </summary>
    /// <param name="description">Natural language description of the UI element to find.</param>
    /// <param name="imageBase64">Base64 encoded screenshot image (if not provided, a screenshot will be taken).</param>
    /// <param name="mechanism">Vision mechanism to use (screengrasp2, screengrasp2-low, screengrasp2-medium, screengrasp2-high, llabs, anthropic-computer-use, openai-computer-use, qwen25-vl-72b).</param>
    /// <returns>Response containing the found UI element, if any.</returns>
    public async Task<FindUiElementResponse> FindUiElementAsync(
        string description,
        string imageBase64 = null,
        string mechanism = "screengrasp2")
    {
        var request = new FindUiElementRequest
        {
            TaskDescription = description,
            ImageBase64 = imageBase64,
            Mechanism = mechanism
        };
        // Serialize with camelCase so the body matches the endpoint's
        // documented field names (taskDescription, imageBase64, mechanism).
        var json = JsonSerializer.Serialize(request, _jsonOptions);
        var content = new StringContent(json, Encoding.UTF8, "application/json");
        var response = await _httpClient.PostAsync("tools-api/screenshot/find-ui-element", content);
        response.EnsureSuccessStatusCode();
        var responseBody = await response.Content.ReadAsStringAsync();
        return JsonSerializer.Deserialize<FindUiElementResponse>(responseBody, _jsonOptions);
    }

    /// <summary>
    /// Save the screenshot from the response to a file, drawing a rectangle
    /// around the found UI element.
    /// </summary>
    /// <param name="response">The response from FindUiElementAsync.</param>
    /// <param name="outputPath">Path to save the visualization image.</param>
    /// <returns>Path to the saved image.</returns>
    /// <exception cref="InvalidOperationException">Thrown when the response carries no screenshot.</exception>
    public string SaveVisualization(FindUiElementResponse response, string outputPath)
    {
        if (!response.Success || string.IsNullOrEmpty(response.ImageBase64))
        {
            throw new InvalidOperationException("No screenshot in response or operation failed");
        }
        // Decode the base64 screenshot into a drawable bitmap.
        var imageBytes = Convert.FromBase64String(response.ImageBase64);
        using var ms = new MemoryStream(imageBytes);
        using var originalImage = Image.FromStream(ms);
        // Work on a copy so the source stream can be disposed safely.
        using var image = new Bitmap(originalImage);
        using var graphics = Graphics.FromImage(image);
        using var pen = new Pen(Color.Red, 2);
        if (response.X.HasValue && response.Y.HasValue)
        {
            // Mark the detected coordinates with a small rectangle.
            graphics.DrawRectangle(pen, response.X.Value, response.Y.Value, 10, 10);
        }
        image.Save(outputPath);
        return outputPath;
    }
}
// Example usage
class Program
{
    static async Task Main(string[] args)
    {
        var client = new ToolsServerClient();
        try
        {
            // Locate the login button on the current screen.
            var result = await client.FindUiElementAsync(
                description: "login button",
                mechanism: "screengrasp2-medium");

            // Report the key response fields.
            Console.WriteLine($"Success: {result.Success}");
            Console.WriteLine($"Message: {result.Message}");
            Console.WriteLine($"Timestamp: {result.Timestamp}");
            Console.WriteLine($"X: {result.X}");
            Console.WriteLine($"Y: {result.Y}");
            Console.WriteLine($"Status: {result.Status}");

            // Persist an annotated screenshot to the desktop when available.
            if (result.Success && !string.IsNullOrEmpty(result.ImageBase64))
            {
                var desktop = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
                var outputPath = Path.Combine(desktop, "ui_elements.png");
                client.SaveVisualization(result, outputPath);
                Console.WriteLine($"Visualization saved to: {outputPath}");
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
    }
}
This example searches for a button with the text "Submit" on the screen:
// Request
POST /tools-api/screenshot/find-ui-element
{
"taskDescription": "submit button",
"mechanism": "screengrasp2-medium"
}
// Response
{
"success": true,
"message": "Found UI element",
"timestamp": "2023-03-09T14:30:00.000Z",
"x": 850,
"y": 520,
"status": "Element found",
"imageBase64": "base64_encoded_screenshot..."
}
This example searches for a username input field on the screen:
// Request
POST /tools-api/screenshot/find-ui-element
{
"taskDescription": "username input field",
"imageBase64": "base64_encoded_screenshot..."
}
// Response
{
"success": true,
"message": "Found UI element",
"timestamp": "2023-03-09T14:30:00.000Z",
"x": 480,
"y": 320,
"status": "Element found",
"imageBase64": "base64_encoded_screenshot..."
}
This example supplies its own screenshot and selects the "llabs" vision mechanism:
// Request
POST /tools-api/screenshot/find-ui-element
{
"taskDescription": "settings icon",
"imageBase64": "base64_encoded_screenshot...",
"mechanism": "llabs"
}
// Response
{
"success": true,
"message": "Found UI element",
"timestamp": "2023-03-09T14:30:00.000Z",
"x": 250,
"y": 25,
"status": "Element found",
"imageBase64": "base64_encoded_screenshot..."
}