feat: add CloudCLI computer use semantics, desktop helper packaging, and permission onboarding

This commit is contained in:
Simos Mikelatos
2026-06-19 12:09:55 +00:00
parent a35200f340
commit 1726705459
37 changed files with 3036 additions and 426 deletions

View File

@@ -0,0 +1,67 @@
import {
captureScreenshot,
executor,
type ExecutorTarget,
} from '@/modules/computer-use/computer-executor.js';
import type { RawActionResult, RawComputerAction, RawActionTarget } from '@/modules/computer-use/actions/raw-action-types.js';
/** Pause length used when a wait action does not carry an explicit duration. */
const DEFAULT_WAIT_MS = 1000;
/** Longest pause a single wait action is allowed to request. */
const MAX_WAIT_MS = 10_000;

/**
 * Turns a requested wait duration into a whole number of milliseconds
 * inside the `[0, MAX_WAIT_MS]` range.
 *
 * @param ms Requested duration; `undefined` selects `DEFAULT_WAIT_MS`.
 * @returns The clamped duration with any fractional part dropped.
 * @throws Error when `ms` is `NaN` or infinite.
 */
function normalizeWaitMs(ms: number | undefined): number {
  if (ms === undefined) {
    return DEFAULT_WAIT_MS;
  }

  const isUsable = Number.isFinite(ms);
  if (!isUsable) {
    throw new Error('Computer Use wait duration must be a finite number.');
  }

  const atLeastZero = Math.max(0, ms);
  const withinCap = Math.min(MAX_WAIT_MS, atLeastZero);
  return Math.trunc(withinCap);
}
/**
 * Captures a screenshot and packages it as an action result.
 *
 * The reported display size is the one returned by the capture; when the
 * capture reports none, the size already known for `target` is used instead.
 */
async function snapshot(target: RawActionTarget): Promise<RawActionResult> {
  const shot = await captureScreenshot();
  const displaySize = shot.size || target.displaySize;
  return { screenshotDataUrl: shot.dataUrl, displaySize };
}
/**
 * Executes one raw computer action through the executor and returns a fresh
 * screenshot taken after the action completed.
 *
 * Pointer-related actions additionally report a `cursor` (and, for
 * `cursor_position`, a `position`) alongside the screenshot.
 *
 * @param action The action to perform, discriminated by `action.type`.
 * @param target Supplies the display size handed to the executor.
 * @returns The post-action screenshot plus any cursor information.
 * @throws Error for an action type that is not part of `RawComputerAction`.
 */
export async function runRawComputerAction(
  action: RawComputerAction,
  target: RawActionTarget,
): Promise<RawActionResult> {
  const executorTarget: ExecutorTarget = { displaySize: target.displaySize };

  // Fields layered on top of the post-action snapshot; left empty for
  // actions that only need the screenshot itself.
  let extras: RawActionResult = {};

  switch (action.type) {
    case 'screenshot':
      break;
    case 'cursor_position': {
      // The position is read before the screenshot is captured.
      const position = await executor.cursorPosition(executorTarget);
      extras = { position, cursor: position };
      break;
    }
    case 'mouse_move':
      await executor.moveTo(executorTarget, action.point);
      extras = { cursor: action.point };
      break;
    case 'click':
      await executor.click(executorTarget, action.button, action.point, action.double === true);
      extras = { cursor: action.point ?? null };
      break;
    case 'drag':
      await executor.drag(executorTarget, action.from, action.to, action.button ?? 'left');
      extras = { cursor: action.to };
      break;
    case 'type':
      await executor.type(action.text);
      break;
    case 'key':
      await executor.pressChord(action.key);
      break;
    case 'scroll':
      await executor.scroll(executorTarget, action.direction, action.amount ?? 3, action.point);
      extras = { cursor: action.point ?? null };
      break;
    case 'wait': {
      const delayMs = normalizeWaitMs(action.ms);
      await new Promise((resolve) => setTimeout(resolve, delayMs));
      break;
    }
    default: {
      // Compile-time exhaustiveness guard; at runtime this reports the stray type.
      const exhaustive: never = action;
      throw new Error(`Unsupported computer action: ${(exhaustive as { type?: string }).type || 'unknown'}`);
    }
  }

  const base = await snapshot(target);
  return { ...base, ...extras };
}

View File

@@ -0,0 +1,28 @@
import type {
ClickButton,
DisplaySize,
Point,
ScrollDirection,
} from '@/modules/computer-use/computer-executor.js';
/**
 * A single raw desktop action, discriminated by its `type` tag.
 *
 * Optional fields may be omitted by the caller; any defaulting is done by
 * the code that executes the action, not by this type.
 */
export type RawComputerAction =
// Capture the screen without performing any input.
| { type: 'screenshot' }
// Query the current pointer location.
| { type: 'cursor_position' }
// Move the pointer to `point`.
| { type: 'mouse_move'; point: Point }
// Click `button`, optionally at `point`; `double` requests a double click.
| { type: 'click'; button: ClickButton; point?: Point; double?: boolean }
// Drag from `from` to `to` with an optional button.
| { type: 'drag'; from: Point; to: Point; button?: ClickButton }
// Type the given text.
| { type: 'type'; text: string }
// Press a key or key chord described by `key`.
| { type: 'key'; key: string }
// Scroll in `direction`, optionally by `amount` and at `point`.
| { type: 'scroll'; direction: ScrollDirection; amount?: number; point?: Point }
// Pause for `ms` milliseconds.
| { type: 'wait'; ms?: number };
/** Describes the display an action is aimed at. */
export type RawActionTarget = {
// Known size of the display, or `null` when it has not been determined.
displaySize: DisplaySize | null;
};
/** Outcome of a raw action; every field is optional and may be `null`. */
export type RawActionResult = {
// Screenshot encoded as a data URL.
screenshotDataUrl?: string | null;
// Display size associated with the screenshot.
displaySize?: DisplaySize | null;
// Pointer location to record after the action.
cursor?: Point | null;
// Pointer location reported by a position query.
position?: Point | null;
};

View File

@@ -0,0 +1,450 @@
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import {
captureScreenshot,
executor,
type ClickButton,
type ExecutorTarget,
type Point,
type ScrollDirection,
} from '@/modules/computer-use/computer-executor.js';
import type { SemanticAdapter } from '@/modules/computer-use/semantics/adapters/semantic-adapter.js';
import { createMacOsSemanticAdapter } from '@/modules/computer-use/semantics/adapters/macos/macos-semantic-adapter.js';
import { createWindowsSemanticAdapter } from '@/modules/computer-use/semantics/adapters/windows/windows-semantic-adapter.js';
import { resolveSemanticHelper } from '@/modules/computer-use/semantics/helpers/semantic-helper-resolver.js';
import { semanticSessionStore } from '@/modules/computer-use/semantics/semantic-session-store.js';
import type { SemanticAppState, SemanticElement } from '@/modules/computer-use/semantics/semantic-types.js';
// Promise-returning wrapper around `child_process.execFile`.
const execFileAsync = promisify(execFile);
// Row limit interpolated into the macOS accessibility AppleScript.
const MAX_APP_STATE_ELEMENTS = 250;
// Memoized helper adapter: `undefined` = not resolved yet, `null` = resolved and unavailable.
let helperAdapter: SemanticAdapter | null | undefined;
/**
 * Coerces an untyped tool argument to a trimmed string.
 *
 * @returns The trimmed string, or `''` when `value` is not a string.
 */
function readString(value: unknown): string {
  if (typeof value !== 'string') {
    return '';
  }
  return value.trim();
}
/**
 * Coerces an untyped tool argument to a finite number.
 *
 * @returns `value` when it is a finite number; otherwise `undefined`
 * (non-numbers, `NaN` and infinities are all rejected).
 */
function readNumber(value: unknown): number | undefined {
  if (typeof value !== 'number') {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
/**
 * Reads a mouse button name from an untyped tool argument.
 *
 * @returns `'right'` or `'middle'` when given exactly; `'left'` for anything else.
 */
function readButton(value: unknown): ClickButton {
  if (value === 'right' || value === 'middle') {
    return value;
  }
  return 'left';
}
/**
 * Reads a click count from an untyped tool argument.
 *
 * @returns A whole number between 1 and 5; 1 when `value` is not a finite number.
 */
function readClickCount(value: unknown): number {
  const requested = readNumber(value);
  if (requested === undefined) {
    return 1;
  }
  const whole = Math.trunc(requested);
  return Math.min(5, Math.max(1, whole));
}
/**
 * Reads a scroll direction from an untyped tool argument.
 *
 * @returns `'up'`, `'left'` or `'right'` when given exactly; `'down'` for anything else.
 */
function readDirection(value: unknown): ScrollDirection {
  if (value === 'up' || value === 'left' || value === 'right') {
    return value;
  }
  return 'down';
}
/**
 * Extracts the session id from tool input.
 *
 * @returns The trimmed `sessionId`, or `'default'` when it is missing, blank or not a string.
 */
function readSessionId(input: Record<string, unknown>): string {
  const explicit = readString(input.sessionId);
  return explicit === '' ? 'default' : explicit;
}
/**
 * Computes the centre of an element's bounding box.
 *
 * @returns The centre rounded to whole coordinates, or `null` when the element has no bounds.
 */
function centerOf(element: SemanticElement): Point | null {
  const { bounds } = element;
  if (!bounds) {
    return null;
  }

  const { x, y, width, height } = bounds;
  const centerX = Math.round(x + width / 2);
  const centerY = Math.round(y + height / 2);
  return { x: centerX, y: centerY };
}
/**
 * Looks up an element in the semantic session store.
 *
 * Forwards all four arguments unchanged to `semanticSessionStore.getElement`
 * and returns its result.
 */
function getCachedElement(sessionId: string, app: string, index: string, stateId?: string): SemanticElement | null {
return semanticSessionStore.getElement(sessionId, app, index, stateId);
}
/**
 * Resolves the screen point a tool call refers to.
 *
 * Explicit finite `x`/`y` values win. Otherwise `element_index` (optionally
 * scoped by `stateId`) is looked up among the cached elements and the centre
 * of that element's bounds is used.
 *
 * @returns The point, or `undefined` when neither form yields one.
 */
function getPoint(input: Record<string, unknown>, sessionId: string, app: string): Point | undefined {
  const x = readNumber(input.x);
  const y = readNumber(input.y);
  const hasExplicitPoint = x !== undefined && y !== undefined;

  if (!hasExplicitPoint) {
    const elementIndex = readString(input.element_index);
    if (!elementIndex) {
      return undefined;
    }

    const stateId = readString(input.stateId) || undefined;
    const element = getCachedElement(sessionId, app, elementIndex, stateId);
    if (!element) {
      return undefined;
    }
    return centerOf(element) ?? undefined;
  }

  return { x, y };
}
/**
 * Returns the platform helper adapter, resolving it on first use.
 *
 * The outcome is memoized in `helperAdapter`, including the `null` outcome
 * for platforms other than macOS/Windows or when the helper is reported as
 * not available.
 */
function getHelperAdapter(): SemanticAdapter | null {
  if (helperAdapter === undefined) {
    const platform = process.platform;
    const platformSupported = platform === 'darwin' || platform === 'win32';

    // The helper is only resolved on supported platforms.
    if (!platformSupported || !resolveSemanticHelper().available) {
      helperAdapter = null;
    } else if (platform === 'darwin') {
      helperAdapter = createMacOsSemanticAdapter();
    } else {
      helperAdapter = createWindowsSemanticAdapter();
    }
  }
  return helperAdapter;
}
/**
 * Decides whether a helper failure should trigger the fallback path.
 *
 * @returns `true` when the error text (case-insensitively) mentions
 * "not implemented", "unavailable", "not found" or "does not exist".
 */
function shouldFallbackFromHelper(error: unknown): boolean {
  let text: string;
  if (error instanceof Error) {
    text = error.message;
  } else {
    text = String(error);
  }
  return /not implemented|unavailable|not found|does not exist/i.test(text);
}
/**
 * Runs `operation` against the helper adapter and stores the resulting state.
 *
 * @returns The saved state; `null` when there is no helper adapter or when
 * the failure is one that `shouldFallbackFromHelper` accepts (a warning is
 * logged in that case).
 * @throws Any other error raised by the operation or by saving the state.
 */
async function withHelperState(
  sessionId: string,
  operation: (adapter: SemanticAdapter) => Promise<SemanticAppState>,
): Promise<SemanticAppState | null> {
  const adapter = getHelperAdapter();
  if (!adapter) {
    return null;
  }

  try {
    const state = await operation(adapter);
    return semanticSessionStore.save(sessionId, state);
  } catch (error) {
    if (!shouldFallbackFromHelper(error)) {
      throw error;
    }
    const reason = error instanceof Error ? error.message : String(error);
    console.warn('[ComputerSemantics] Falling back from helper:', reason);
    return null;
  }
}
/**
 * Executes a command without a shell and resolves with its stdout.
 *
 * @param command Executable to launch.
 * @param args Arguments passed to the executable.
 * @param timeout Time limit in milliseconds (5000 by default).
 */
async function run(command: string, args: string[], timeout = 5000): Promise<string> {
  const options = {
    timeout,
    windowsHide: true,
    maxBuffer: 4 * 1024 * 1024, // 4 MiB of captured output
  };
  const result = await execFileAsync(command, args, options);
  return result.stdout;
}
/**
 * Lists macOS application processes that are not background-only by asking
 * System Events through `osascript`.
 *
 * @returns One `{ name, running: true }` entry per name in the script output.
 */
async function listMacApps(): Promise<Array<Record<string, unknown>>> {
  const scriptLines = [
    'tell application "System Events"',
    'set appRows to {}',
    'repeat with p in (application processes whose background only is false)',
    'set end of appRows to (name of p as text)',
    'end repeat',
    'return appRows',
    'end tell',
  ];
  const stdout = await run('osascript', ['-e', scriptLines.join('\n')]);

  // The output is treated as a ", "-separated list of names.
  const apps: Array<Record<string, unknown>> = [];
  for (const piece of stdout.split(', ')) {
    const name = piece.trim();
    if (name) {
      apps.push({ name, running: true });
    }
  }
  return apps;
}
/**
 * Lists Windows processes that have a main window title, using PowerShell
 * and its JSON output.
 *
 * @returns One `{ name, pid, windowTitle, running: true }` entry per process.
 */
async function listWindowsApps(): Promise<Array<Record<string, unknown>>> {
  const script = [
    'Get-Process | Where-Object { $_.MainWindowTitle } |',
    'Select-Object ProcessName, Id, MainWindowTitle | ConvertTo-Json -Depth 3',
  ].join(' ');
  const stdout = await run('powershell.exe', ['-NoProfile', '-Command', script]);

  // A parsed value that is not an array is wrapped so both shapes are handled alike.
  const decoded = JSON.parse(stdout || '[]');
  const processes = Array.isArray(decoded) ? decoded : [decoded];

  const apps: Array<Record<string, unknown>> = [];
  for (const proc of processes) {
    apps.push({
      name: proc.ProcessName,
      pid: proc.Id,
      windowTitle: proc.MainWindowTitle,
      running: true,
    });
  }
  return apps;
}
/**
 * Lists open windows on Linux via `wmctrl -lx`.
 *
 * When `wmctrl` cannot be run, falls back to a de-duplicated list of process
 * names from `ps`, capped at 200 entries.
 *
 * @returns Window rows (`windowId`, `desktop`, `host`, `className`,
 * `windowTitle`, `running`) or, in the fallback, `{ name, running }` rows.
 */
async function listLinuxApps(): Promise<Array<Record<string, unknown>>> {
  try {
    const output = await run('wmctrl', ['-lx']);
    return output.split(/\r?\n/)
      .map((line) => line.trim())
      .filter(Boolean)
      .map((line) => {
        // `wmctrl -lx` columns: window id, desktop number, WM_CLASS,
        // client machine name, then the title (which may contain spaces).
        const [windowId, desktop, className, host, ...titleParts] = line.split(/\s+/);
        return {
          windowId,
          desktop,
          host,
          className,
          windowTitle: titleParts.join(' '),
          running: true,
        };
      });
  } catch {
    // Best effort: without a usable wmctrl, report running process names instead.
    const output = await run('ps', ['-eo', 'comm=']);
    const uniqueNames = new Set(
      output.split(/\r?\n/).map((name) => name.trim()).filter(Boolean),
    );
    return [...uniqueNames]
      .slice(0, 200)
      .map((name) => ({ name, running: true }));
  }
}
/**
 * Lists running applications using the implementation for the current
 * platform: macOS, Windows, or the Linux implementation for everything else.
 */
async function listApps(): Promise<Array<Record<string, unknown>>> {
  switch (process.platform) {
    case 'darwin':
      return listMacApps();
    case 'win32':
      return listWindowsApps();
    default:
      return listLinuxApps();
  }
}
/**
 * Captures a flat list of UI elements for a running macOS app by driving
 * System Events through an AppleScript executed with `osascript`.
 *
 * The script walks each window of the process (child depth limited to 4) and
 * emits one tab-separated row per element: counter, role, title, value and
 * bounds as "x,y,width,height". It stops collecting once the row count
 * passes `MAX_APP_STATE_ELEMENTS`.
 *
 * @param app Process name as known to System Events.
 * @returns Parsed elements; `bounds` is set only when four finite numbers were emitted.
 * @throws When `osascript` fails, e.g. the script errors because the app is not running.
 */
async function macAccessibilityTree(app: string): Promise<SemanticElement[]> {
// Escape backslashes and double quotes so the name can sit inside an AppleScript string literal.
const escapedApp = app.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
const script = `
on safeText(v)
try
return v as text
on error
return ""
end try
end safeText
on emitElement(e, depth, maxDepth, counter)
if depth > maxDepth then return {}
set rows to {}
try
set roleText to my safeText(role of e)
on error
set roleText to "element"
end try
try
set titleText to my safeText(title of e)
on error
set titleText to ""
end try
try
set valueText to my safeText(value of e)
on error
set valueText to ""
end try
try
set posValue to position of e
set sizeValue to size of e
set boundsText to ((item 1 of posValue) as text) & "," & ((item 2 of posValue) as text) & "," & ((item 1 of sizeValue) as text) & "," & ((item 2 of sizeValue) as text)
on error
set boundsText to ""
end try
set end of rows to ((counter as text) & tab & roleText & tab & titleText & tab & valueText & tab & boundsText)
if counter > ${MAX_APP_STATE_ELEMENTS} then return rows
try
repeat with childElement in UI elements of e
set childRows to my emitElement(childElement, depth + 1, maxDepth, counter + (count of rows))
set rows to rows & childRows
if (count of rows) > ${MAX_APP_STATE_ELEMENTS} then return rows
end repeat
end try
return rows
end emitElement
tell application "System Events"
if not (exists process "${escapedApp}") then error "App is not running: ${escapedApp}"
tell process "${escapedApp}"
set rows to {}
repeat with w in windows
set rows to rows & my emitElement(w, 0, 4, (count of rows) + 1)
if (count of rows) > ${MAX_APP_STATE_ELEMENTS} then exit repeat
end repeat
return rows
end tell
end tell
`;
// Uses a 10 second timeout instead of run()'s 5 second default.
const output = await run('osascript', ['-e', script], 10000);
// Rows are separated by newlines or by ", ".
// NOTE(review): a title or value that itself contains ", " would be split into separate rows — confirm acceptable.
return output.split(/\r?\n|, /)
.map((line) => line.trim())
.filter(Boolean)
.map((line, index) => {
// Row layout: counter \t role \t title \t value \t "x,y,width,height".
const [rawIndex, role, title, value, boundsText] = line.split('\t');
const boundsParts = (boundsText || '').split(',').map((part) => Number.parseFloat(part));
const hasBounds = boundsParts.length === 4 && boundsParts.every(Number.isFinite);
return {
// Fall back to the 1-based row position when the script emitted no counter.
index: rawIndex || String(index + 1),
role: role || 'element',
title: title || undefined,
value: value || undefined,
bounds: hasBounds
? { x: boundsParts[0], y: boundsParts[1], width: boundsParts[2], height: boundsParts[3] }
: undefined,
};
});
}
/**
 * Best-effort accessibility tree capture for `app`.
 *
 * Never rejects: on macOS a capture failure is reported through `message`
 * with an empty element list, and other platforms always return an empty
 * list with an explanatory message.
 */
async function getAccessibilityTree(app: string): Promise<{ elements: SemanticElement[]; message?: string }> {
  if (process.platform !== 'darwin') {
    return {
      elements: [],
      message: 'Native accessibility tree capture is not implemented for this platform yet.',
    };
  }

  try {
    const elements = await macAccessibilityTree(app);
    return { elements };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { elements: [], message };
  }
}
/**
 * Produces and stores the current semantic state of `app` for a session.
 *
 * The helper adapter is tried first. When it yields nothing, the state is
 * assembled from a screenshot plus the accessibility tree and saved in the
 * session store.
 *
 * @throws Error when `app` is empty.
 */
async function getAppState(sessionId: string, app: string): Promise<SemanticAppState> {
  if (!app) {
    throw new Error('app is required.');
  }

  const fromHelper = await withHelperState(sessionId, (adapter) => adapter.getAppState({ sessionId, app }));
  if (fromHelper) {
    return fromHelper;
  }

  const { dataUrl, size } = await captureScreenshot();
  const { elements, message } = await getAccessibilityTree(app);

  // The same element list is exposed under both `elements` and `accessibilityTree`.
  const assembled: SemanticAppState = {
    stateId: semanticSessionStore.createStateId(),
    app,
    platform: process.platform,
    screenshotDataUrl: dataUrl,
    displaySize: size,
    elements,
    accessibilityTree: elements,
    message,
  };
  return semanticSessionStore.save(sessionId, assembled);
}
/**
 * Builds the executor target for an app, reusing the display size stored
 * with the cached state and capturing a screenshot only when none is stored.
 */
async function targetFor(sessionId: string, app: string, stateId?: string): Promise<ExecutorTarget> {
  const knownSize = semanticSessionStore.getState(sessionId, app, stateId)?.displaySize;
  if (knownSize) {
    return { displaySize: knownSize };
  }

  const fresh = await captureScreenshot();
  return { displaySize: fresh.size };
}
/**
 * Reads text that will be typed into the target application.
 *
 * Unlike `readString`, the value is NOT trimmed: leading/trailing spaces and
 * whitespace-only input are part of what the caller asked to type.
 */
function readRawText(value: unknown): string {
  return typeof value === 'string' ? value : '';
}

/**
 * Semantic Computer Use tool dispatcher.
 *
 * For every tool except `list_apps` and `get_app_state`, the helper adapter
 * is tried first through `withHelperState`; when that yields no state, the
 * call falls back to the generic executor (pointer/keyboard input) and then
 * returns a freshly captured app state.
 */
export const computerSemanticsService = {
  /**
   * Executes one semantic tool.
   *
   * @param name Tool name: `list_apps`, `get_app_state`, `click`, `drag`,
   *   `scroll`, `type_text`, `press_key`, `set_value` or `perform_secondary_action`.
   * @param input Untyped tool arguments; `sessionId` defaults to `'default'`.
   * @returns The app list for `list_apps`, otherwise the resulting app state.
   * @throws Error for unknown tool names or when required arguments are missing.
   */
  async callTool(name: string, input: Record<string, unknown>): Promise<unknown> {
    const sessionId = readSessionId(input);
    switch (name) {
      case 'list_apps': {
        const adapter = getHelperAdapter();
        if (adapter) {
          try {
            return { apps: await adapter.listApps(), platform: process.platform };
          } catch (error) {
            if (!shouldFallbackFromHelper(error)) {
              throw error;
            }
            console.warn('[ComputerSemantics] Falling back from helper:', error instanceof Error ? error.message : String(error));
          }
        }
        return { apps: await listApps(), platform: process.platform };
      }
      case 'get_app_state':
        return getAppState(sessionId, readString(input.app));
      case 'click': {
        const app = readString(input.app);
        const helperState = await withHelperState(sessionId, (adapter) => adapter.clickElement({ ...input, sessionId, app }));
        if (helperState) {
          return helperState;
        }
        const stateId = readString(input.stateId) || undefined;
        const point = getPoint(input, sessionId, app);
        if (!point) {
          throw new Error('click requires x/y or an element_index from computer_get_app_state.');
        }
        const target = await targetFor(sessionId, app, stateId);
        const button = readButton(input.mouse_button ?? input.mouseButton);
        const clickCount = readClickCount(input.click_count ?? input.clickCount);
        // Multi-clicks are issued as consecutive single clicks.
        for (let index = 0; index < clickCount; index += 1) {
          await executor.click(target, button, point, false);
        }
        return getAppState(sessionId, app);
      }
      case 'drag': {
        const app = readString(input.app);
        const helperState = await withHelperState(sessionId, (adapter) => adapter.drag({ ...input, sessionId, app }));
        if (helperState) {
          return helperState;
        }
        const stateId = readString(input.stateId) || undefined;
        const fromX = readNumber(input.from_x);
        const fromY = readNumber(input.from_y);
        const toX = readNumber(input.to_x);
        const toY = readNumber(input.to_y);
        if (fromX === undefined || fromY === undefined || toX === undefined || toY === undefined) {
          throw new Error('drag requires from_x/from_y/to_x/to_y.');
        }
        await executor.drag(await targetFor(sessionId, app, stateId), { x: fromX, y: fromY }, { x: toX, y: toY }, readButton(input.mouse_button ?? input.mouseButton));
        return getAppState(sessionId, app);
      }
      case 'scroll': {
        const app = readString(input.app);
        const helperState = await withHelperState(sessionId, (adapter) => adapter.scrollElement({ ...input, sessionId, app }));
        if (helperState) {
          return helperState;
        }
        const stateId = readString(input.stateId) || undefined;
        const point = getPoint(input, sessionId, app);
        if (!point) {
          throw new Error('scroll requires x/y or an element_index from computer_get_app_state.');
        }
        await executor.scroll(await targetFor(sessionId, app, stateId), readDirection(input.direction), readNumber(input.pages) ?? 1, point);
        return getAppState(sessionId, app);
      }
      case 'type_text': {
        const app = readString(input.app);
        const helperState = await withHelperState(sessionId, (adapter) => adapter.typeText({ ...input, sessionId, app }));
        if (helperState) {
          return helperState;
        }
        // Type the text verbatim; trimming would drop intentional whitespace.
        await executor.type(readRawText(input.text));
        return getAppState(sessionId, app);
      }
      case 'press_key': {
        const app = readString(input.app);
        const helperState = await withHelperState(sessionId, (adapter) => adapter.pressKey({ ...input, sessionId, app }));
        if (helperState) {
          return helperState;
        }
        await executor.pressChord(readString(input.key));
        return getAppState(sessionId, app);
      }
      case 'set_value': {
        const app = readString(input.app);
        const helperState = await withHelperState(sessionId, (adapter) => adapter.setValue({ ...input, sessionId, app }));
        if (helperState) {
          return helperState;
        }
        const stateId = readString(input.stateId) || undefined;
        const point = getPoint(input, sessionId, app);
        if (!point) {
          throw new Error('set_value requires x/y or an element_index from computer_get_app_state.');
        }
        // Emulate "set value": focus the field, select all, then type over the selection.
        await executor.click(await targetFor(sessionId, app, stateId), 'left', point, false);
        await executor.pressChord(process.platform === 'darwin' ? 'cmd+a' : 'ctrl+a');
        // The new value is typed verbatim, including surrounding whitespace.
        await executor.type(readRawText(input.value));
        return getAppState(sessionId, app);
      }
      case 'perform_secondary_action': {
        const app = readString(input.app);
        const helperState = await withHelperState(sessionId, (adapter) => adapter.performSecondaryAction({ ...input, sessionId, app }));
        if (helperState) {
          return helperState;
        }
        const stateId = readString(input.stateId) || undefined;
        const point = getPoint(input, sessionId, app);
        if (!point) {
          throw new Error('perform_secondary_action requires x/y or an element_index from computer_get_app_state.');
        }
        // The fallback for a secondary action is a right click at the resolved point.
        await executor.click(await targetFor(sessionId, app, stateId), 'right', point, false);
        return getAppState(sessionId, app);
      }
      default:
        throw new Error(`Unknown semantic Computer Use tool: ${name}`);
    }
  },
};

View File

@@ -1,6 +1,7 @@
import express from 'express';
import { computerUseService } from '@/modules/computer-use/computer-use.service.js';
import { semanticOperationForMcpTool } from '@/modules/computer-use/semantics/semantic-tool-dispatcher.js';
const router = express.Router();
@@ -36,6 +37,22 @@ function point(input: Record<string, unknown>): { x: number; y: number } | undef
: undefined;
}
/**
 * Reads a mandatory numeric field from tool input.
 *
 * @returns The field value when it is a finite number.
 * @throws Error naming the field when it is missing, not a number, or not finite.
 */
function requireNumber(input: Record<string, unknown>, name: string): number {
  const candidate = input[name];
  if (typeof candidate === 'number' && Number.isFinite(candidate)) {
    return candidate;
  }
  throw new Error(`${name} is required and must be a finite number.`);
}
/**
 * Reads the mandatory `x`/`y` pair from tool input.
 *
 * @throws Error (from `requireNumber`) when either coordinate is not a finite number.
 */
function requirePoint(input: Record<string, unknown>): { x: number; y: number } {
  const x = requireNumber(input, 'x');
  const y = requireNumber(input, 'y');
  return { x, y };
}
/**
 * Reads a mandatory point whose coordinates live under custom field names.
 *
 * @param xName Field holding the x coordinate.
 * @param yName Field holding the y coordinate.
 * @throws Error (from `requireNumber`) when either coordinate is not a finite number.
 */
function requireNamedPoint(input: Record<string, unknown>, xName: string, yName: string): { x: number; y: number } {
  const x = requireNumber(input, xName);
  const y = requireNumber(input, yName);
  return { x, y };
}
router.use((req, res, next) => {
const expected = computerUseService.getMcpToken();
const token = readBearerToken(req.headers.authorization) || String(req.headers['x-computer-use-mcp-token'] || '');
@@ -49,17 +66,18 @@ router.use((req, res, next) => {
router.post('/tools/:toolName', async (req, res) => {
try {
const input = (req.body && typeof req.body === 'object' ? req.body : {}) as Record<string, unknown>;
const sessionId = typeof input.sessionId === 'string' ? input.sessionId : '';
const sessionId = typeof input.sessionId === 'string' ? input.sessionId : undefined;
const toolName = req.params.toolName;
const semanticOperation = semanticOperationForMcpTool(toolName);
let result: unknown;
if (semanticOperation) {
result = await computerUseService.callSemanticTool(semanticOperation, input);
res.json({ success: true, data: result });
return;
}
switch (toolName) {
case 'computer_create_session':
result = await computerUseService.createAgentSession();
break;
case 'computer_list_sessions':
result = await computerUseService.listAgentSessions();
break;
case 'computer_screenshot':
result = await computerUseService.agentScreenshot(sessionId);
break;
@@ -67,28 +85,23 @@ router.post('/tools/:toolName', async (req, res) => {
result = await computerUseService.agentCursorPosition(sessionId);
break;
case 'computer_mouse_move':
result = await computerUseService.agentMouseMove(sessionId, point(input) || { x: 0, y: 0 });
result = await computerUseService.agentMouseMove(sessionId, requirePoint(input));
break;
case 'computer_left_click':
result = await computerUseService.agentClick(sessionId, 'left', point(input));
case 'computer_click':
result = await computerUseService.agentUnifiedClick(sessionId, {
button: toButton(input.mouseButton ?? input.mouse_button ?? input.button),
point: point(input),
clickCount: typeof input.clickCount === 'number'
? input.clickCount
: typeof input.click_count === 'number'
? input.click_count
: 1,
});
break;
case 'computer_right_click':
result = await computerUseService.agentClick(sessionId, 'right', point(input));
break;
case 'computer_middle_click':
result = await computerUseService.agentClick(sessionId, 'middle', point(input));
break;
case 'computer_double_click':
result = await computerUseService.agentClick(sessionId, toButton(input.button), point(input), true);
break;
case 'computer_left_click_drag': {
const from = typeof input.startX === 'number' && typeof input.startY === 'number'
? { x: input.startX, y: input.startY }
: { x: 0, y: 0 };
const to = typeof input.endX === 'number' && typeof input.endY === 'number'
? { x: input.endX, y: input.endY }
: { x: 0, y: 0 };
result = await computerUseService.agentDrag(sessionId, from, to, 'left');
case 'computer_drag': {
const from = requireNamedPoint(input, 'startX', 'startY');
const to = requireNamedPoint(input, 'endX', 'endY');
result = await computerUseService.agentDrag(sessionId, from, to, toButton(input.mouseButton ?? input.mouse_button ?? input.button));
break;
}
case 'computer_type':

View File

@@ -56,43 +56,34 @@ router.get('/status', async (_req, res) => {
}
});
router.get('/settings', async (_req, res) => {
router.get('/settings', async (req: AuthenticatedRequest, res) => {
try {
requireUser(req);
res.json({ success: true, data: { settings: await computerUseService.getSettings() } });
} catch (error) {
res.status(500).json({
res.status(getErrorStatusCode(error, 500)).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to load Computer Use settings.',
});
}
});
router.put('/settings', async (req, res) => {
router.put('/settings', async (req: AuthenticatedRequest, res) => {
try {
requireUser(req);
const settings = await computerUseService.updateSettings(req.body || {});
res.json({ success: true, data: { settings } });
} catch (error) {
res.status(400).json({
res.status(getErrorStatusCode(error, 400)).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to save Computer Use settings.',
});
}
});
router.post('/agent-tools/register', async (_req, res) => {
try {
const result = await computerUseService.registerAgentMcp();
res.status(201).json({ success: true, data: result });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to register Computer Use MCP.',
});
}
});
router.post('/runtime/install', async (_req, res) => {
router.post('/runtime/install', async (req: AuthenticatedRequest, res) => {
try {
requireUser(req);
const result = await computerUseService.installRuntime();
res.status(result.success ? 200 : 500).json({
success: result.success,
@@ -100,7 +91,7 @@ router.post('/runtime/install', async (_req, res) => {
error: result.success ? undefined : result.message,
});
} catch (error) {
res.status(500).json({
res.status(getErrorStatusCode(error, 500)).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to install Computer Use runtime.',
});
@@ -118,18 +109,6 @@ router.get('/sessions', async (req: AuthenticatedRequest, res) => {
}
});
router.post('/sessions', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.createSession(requireUser(req));
res.status(session.status === 'unavailable' ? 202 : 201).json({ success: true, data: { session } });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to create Computer Use session.',
});
}
});
router.post('/sessions/:sessionId/screenshot', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.userScreenshot(requireUser(req), readParam(req.params.sessionId));
@@ -169,18 +148,6 @@ router.post('/sessions/:sessionId/click', async (req: AuthenticatedRequest, res)
}
});
router.post('/sessions/:sessionId/type', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.userType(requireUser(req), readParam(req.params.sessionId), String(req.body?.text || ''));
res.json({ success: true, data: { session } });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to type text.',
});
}
});
router.post('/sessions/:sessionId/press-key', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.userPressKey(requireUser(req), readParam(req.params.sessionId), String(req.body?.key || ''));

View File

@@ -7,14 +7,16 @@ import { appConfigDb } from '@/modules/database/repositories/app-config.js';
import { providerMcpService } from '@/modules/providers/services/mcp.service.js';
import { getModuleDir } from '@/utils/runtime-paths.js';
import {
executor,
captureScreenshot as captureScreenshotRuntime,
getRuntimeReadiness as getExecutorReadiness,
type Point,
type ClickButton,
type ScrollDirection,
} from '@/modules/computer-use/computer-executor.js';
import { runRawComputerAction } from '@/modules/computer-use/actions/raw-action-dispatcher.js';
import type { RawComputerAction } from '@/modules/computer-use/actions/raw-action-types.js';
import { desktopAgentRelay } from '@/modules/computer-use/desktop-agent-relay.service.js';
import { computerSemanticsService } from '@/modules/computer-use/computer-semantics.service.js';
import { semanticOperationNames } from '@/modules/computer-use/semantics/semantic-tool-dispatcher.js';
const __dirname = getModuleDir(import.meta.url);
const IS_PLATFORM = process.env.VITE_IS_PLATFORM === 'true';
@@ -22,9 +24,6 @@ const MAX_SESSIONS_PER_OWNER = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE
const SESSION_TTL_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_SESSION_TTL_MS || String(30 * 60 * 1000), 10);
const COMPUTER_USE_SETTINGS_KEY = 'computer_use_settings';
const COMPUTER_USE_MCP_TOKEN_KEY = 'computer_use_mcp_token';
const DEFAULT_AGENT_WAIT_MS = 1000;
const MAX_AGENT_WAIT_MS = 10_000;
type ComputerUseRuntime = 'cloud' | 'local';
type ComputerUseSessionStatus = 'ready' | 'stopped' | 'unavailable';
@@ -61,7 +60,6 @@ type ComputerUseOwner = {
type ComputerUseSettings = {
enabled: boolean;
agentToolsEnabled: boolean;
};
type RuntimeReadiness = {
@@ -79,7 +77,6 @@ let lastInstallMessage: string | null = null;
const DEFAULT_SETTINGS: ComputerUseSettings = {
enabled: false,
agentToolsEnabled: false,
};
const AGENT_OWNER_ID = 'agent';
const MCP_SERVER_NAME = 'cloudcli-computer-use';
@@ -99,7 +96,6 @@ function readSettings(): ComputerUseSettings {
const parsed = JSON.parse(raw) as Partial<ComputerUseSettings>;
return {
enabled: parsed.enabled === true,
agentToolsEnabled: parsed.agentToolsEnabled === true,
};
} catch (error: any) {
console.warn('[Computer Use] Failed to read settings:', error?.message || error);
@@ -110,7 +106,6 @@ function readSettings(): ComputerUseSettings {
function writeSettings(settings: ComputerUseSettings): ComputerUseSettings {
const normalized = {
enabled: settings.enabled === true,
agentToolsEnabled: settings.agentToolsEnabled === true,
};
appConfigDb.set(COMPUTER_USE_SETTINGS_KEY, JSON.stringify(normalized));
@@ -274,6 +269,20 @@ function canAccessSession(ownerId: string, session: ComputerUseSession): boolean
return session.ownerId === ownerId || session.ownerId === AGENT_OWNER_ID;
}
/**
 * Normalizes an optional session id.
 *
 * @returns The trimmed id, or `null` when the input is missing, not a string, or blank.
 */
function normalizeSessionId(sessionId?: string | null): string | null {
  const trimmed = typeof sessionId === 'string' ? sessionId.trim() : '';
  return trimmed.length > 0 ? trimmed : null;
}
/**
 * Picks the agent-owned session to use by default.
 *
 * @returns The `ready` agent session with the latest `updatedAt`, or `null` when there is none.
 */
function findActiveAgentSession(): ComputerUseSession | null {
  const readySessions = ownerSessions(AGENT_OWNER_ID).filter((session) => session.status === 'ready');
  // Newest first, so the head of the list is the most recently updated session.
  readySessions.sort((a, b) => Date.parse(b.updatedAt) - Date.parse(a.updatedAt));
  return readySessions[0] || null;
}
async function expireStaleSessions(now = Date.now()): Promise<void> {
for (const session of sessions.values()) {
if (session.status !== 'ready') {
@@ -301,17 +310,6 @@ async function expireStaleSessions(now = Date.now()): Promise<void> {
// `desktopAgentRelay` and applies the returned screenshot. The local server
// itself never touches the OS in cloud mode.
/** One desktop interaction expressed in screenshot-pixel coordinate space. */
export type ComputerAction =
| { type: 'screenshot' }
| { type: 'mouse_move'; point: Point }
| { type: 'click'; button: ClickButton; point?: Point; double?: boolean }
| { type: 'drag'; from: Point; to: Point; button?: ClickButton }
| { type: 'type'; text: string }
| { type: 'key'; key: string }
| { type: 'scroll'; direction: ScrollDirection; amount?: number; point?: Point }
| { type: 'wait'; ms?: number };
/** Shape the desktop agent returns for any relayed action. */
type RelayResult = {
screenshotDataUrl?: string | null;
@@ -333,14 +331,9 @@ function applyRelayResult(session: ComputerUseSession, result: RelayResult): voi
session.updatedAt = new Date().toISOString();
}
function normalizeAgentWaitMs(ms: number | undefined): number {
if (ms === undefined) {
return DEFAULT_AGENT_WAIT_MS;
}
if (!Number.isFinite(ms)) {
throw new Error('Computer Use wait duration must be a finite number.');
}
return Math.trunc(Math.max(0, Math.min(ms, MAX_AGENT_WAIT_MS)));
function stripSessionArgs(args: Record<string, unknown>): Record<string, unknown> {
const { sessionId: _sessionId, ...toolArgs } = args;
return toolArgs;
}
async function refreshScreenshot(session: ComputerUseSession): Promise<void> {
@@ -349,16 +342,11 @@ async function refreshScreenshot(session: ComputerUseSession): Promise<void> {
applyRelayResult(session, result);
return;
}
const { dataUrl, size } = await captureScreenshotRuntime();
session.screenshotDataUrl = dataUrl;
if (size) {
session.displaySize = size;
}
session.updatedAt = new Date().toISOString();
applyRelayResult(session, await runRawComputerAction({ type: 'screenshot' }, session));
}
/** Runs one action and refreshes the session screenshot afterwards. */
async function performAction(session: ComputerUseSession, action: ComputerAction): Promise<void> {
async function performAction(session: ComputerUseSession, action: RawComputerAction): Promise<void> {
if (getRuntime() === 'cloud') {
const result = (await desktopAgentRelay.relay(action.type, {
...action,
@@ -369,32 +357,7 @@ async function performAction(session: ComputerUseSession, action: ComputerAction
return;
}
switch (action.type) {
case 'screenshot':
break;
case 'mouse_move':
await executor.moveTo(session, action.point);
break;
case 'click':
await executor.click(session, action.button, action.point, action.double === true);
break;
case 'drag':
await executor.drag(session, action.from, action.to, action.button ?? 'left');
break;
case 'type':
await executor.type(action.text);
break;
case 'key':
await executor.pressChord(action.key);
break;
case 'scroll':
await executor.scroll(session, action.direction, action.amount ?? 3, action.point);
break;
case 'wait':
await new Promise((resolve) => setTimeout(resolve, normalizeAgentWaitMs(action.ms)));
break;
}
await refreshScreenshot(session);
applyRelayResult(session, await runRawComputerAction(action, session));
}
/** Reads the current cursor position in screenshot-pixel space. */
@@ -410,7 +373,9 @@ async function getCursorPosition(session: ComputerUseSession): Promise<Point> {
}
return session.cursor ? { x: session.cursor.x, y: session.cursor.y } : { x: 0, y: 0 };
}
return executor.cursorPosition(session);
const result = await runRawComputerAction({ type: 'cursor_position' }, session);
applyRelayResult(session, result);
return result.position || session.cursor || { x: 0, y: 0 };
}
function assertReady(session: ComputerUseSession): void {
@@ -421,14 +386,14 @@ function assertReady(session: ComputerUseSession): void {
/**
* Whether agent tools may operate right now. Cloud mode depends purely on a
* connected desktop agent; local mode depends on the two opt-in settings.
* connected desktop agent; local mode depends on the single feature setting.
*/
function agentToolsAvailable(): boolean {
if (getRuntime() === 'cloud') {
return desktopAgentRelay.isConnected();
}
const settings = readSettings();
return settings.enabled && settings.agentToolsEnabled;
return settings.enabled;
}
function assertAgentToolsAvailable(): void {
@@ -450,21 +415,10 @@ export const computerUseService = {
async updateSettings(settings: Partial<ComputerUseSettings>) {
const current = readSettings();
const enabled = typeof settings.enabled === 'boolean' ? settings.enabled : current.enabled;
const nextSettings = {
...current,
enabled,
agentToolsEnabled: typeof settings.agentToolsEnabled === 'boolean'
? settings.agentToolsEnabled
: enabled,
};
if (!nextSettings.enabled) {
nextSettings.agentToolsEnabled = false;
}
const next = writeSettings(nextSettings);
if (next.agentToolsEnabled) {
const next = writeSettings({ enabled });
if (next.enabled) {
await this.registerAgentMcp();
} else if (current.agentToolsEnabled) {
} else if (current.enabled) {
await this.unregisterAgentMcp();
}
return next;
@@ -487,14 +441,11 @@ export const computerUseService = {
enabled: isCloud ? true : settings.enabled,
runtime: getRuntime(),
available,
requiresDesktopBridge: isCloud,
desktopAgentConnected,
nutInstalled: readiness.nutInstalled,
screenshotInstalled: readiness.screenshotInstalled,
installInProgress: readiness.installInProgress,
sessionCount: sessions.size,
agentToolsEnabled: isCloud ? desktopAgentConnected : settings.agentToolsEnabled,
mcpRecommended: !settings.agentToolsEnabled,
message: available ? 'Computer Use runtime is available.' : getSetupMessage(settings, readiness),
};
},
@@ -704,18 +655,6 @@ export const computerUseService = {
return publicSession(session);
},
async userType(owner: ComputerUseOwner, sessionId: string, text: string) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
if (!session || !canAccessSession(ownerId, session)) {
throw new Error('Computer Use session not found.');
}
assertReady(session);
await performAction(session, { type: 'type', text });
session.lastAction = 'type';
return publicSession(session);
},
async userPressKey(owner: ComputerUseOwner, sessionId: string, key: string) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
@@ -730,46 +669,52 @@ export const computerUseService = {
// --- Agent-initiated actions (via MCP) ------------------------------------
async createAgentSession() {
assertAgentToolsAvailable();
return this.createSession({ id: AGENT_OWNER_ID }, { createdBy: 'agent' });
},
async listAgentSessions() {
if (!agentToolsAvailable()) {
return [];
}
await expireStaleSessions();
return [...sessions.values()].map(publicSession);
},
/**
* Resolves a session the agent is allowed to act on. In local mode this
* enforces the in-process per-session consent flag. In cloud mode the linked
* desktop agent is the consent authority (it prompts the user per its own
* consent mode), so this only requires the relay to be connected.
*/
async getConsentedSession(sessionId: string): Promise<ComputerUseSession> {
async getOrCreateAgentSession(): Promise<ComputerUseSession> {
assertAgentToolsAvailable();
const session = sessions.get(sessionId);
await expireStaleSessions();
const existing = findActiveAgentSession();
if (existing) {
return existing;
}
const created = await this.createSession({ id: AGENT_OWNER_ID }, { createdBy: 'agent' });
const session = sessions.get(created.id);
if (!session) {
throw new Error('Computer Use session could not be created.');
}
return session;
},
async getConsentedSession(sessionId?: string): Promise<ComputerUseSession> {
assertAgentToolsAvailable();
const normalizedSessionId = normalizeSessionId(sessionId);
const session = normalizedSessionId
? sessions.get(normalizedSessionId)
: await this.getOrCreateAgentSession();
if (!session) {
throw new Error('Computer Use session not found.');
}
if (getRuntime() !== 'cloud' && !session.agentAccessEnabled) {
throw new Error('Computer Use session is awaiting user consent. Ask the user to grant control in the Computer panel.');
throw new Error(`Computer Use session ${session.id} is awaiting user consent. Ask the user to grant control in the Computer panel.`);
}
assertReady(session);
return session;
},
async agentScreenshot(sessionId: string) {
async agentScreenshot(sessionId?: string) {
const session = await this.getConsentedSession(sessionId);
await refreshScreenshot(session);
session.lastAction = 'screenshot';
return publicSession(session);
},
async agentCursorPosition(sessionId: string) {
async agentCursorPosition(sessionId?: string) {
const session = await this.getConsentedSession(sessionId);
const point = await getCursorPosition(session);
session.cursor = { ...point, actor: 'agent' };
@@ -777,7 +722,7 @@ export const computerUseService = {
return { session: publicSession(session), position: point };
},
async agentMouseMove(sessionId: string, point: Point) {
async agentMouseMove(sessionId: string | undefined, point: Point) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'mouse_move', point });
session.cursor = { ...point, actor: 'agent' };
@@ -785,39 +730,43 @@ export const computerUseService = {
return publicSession(session);
},
async agentClick(sessionId: string, button: ClickButton, point?: Point, doubleClick = false) {
async agentUnifiedClick(sessionId: string | undefined, input: { button?: ClickButton; point?: Point; clickCount?: number }) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'click', button, point, double: doubleClick });
if (point) {
session.cursor = { ...point, actor: 'agent' };
const button = input.button || 'left';
const clickCount = Math.max(1, Math.min(Math.trunc(input.clickCount || 1), 5));
for (let index = 0; index < clickCount; index += 1) {
await performAction(session, { type: 'click', button, point: input.point, double: false });
}
session.lastAction = doubleClick ? 'double_click' : `${button}_click`;
if (input.point) {
session.cursor = { ...input.point, actor: 'agent' };
}
session.lastAction = clickCount > 1 ? `${button}_click:${clickCount}` : `${button}_click`;
return publicSession(session);
},
async agentDrag(sessionId: string, from: Point, to: Point, button: ClickButton = 'left') {
async agentDrag(sessionId: string | undefined, from: Point, to: Point, button: ClickButton = 'left') {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'drag', from, to, button });
session.cursor = { ...to, actor: 'agent' };
session.lastAction = 'left_click_drag';
session.lastAction = `${button}_drag`;
return publicSession(session);
},
async agentType(sessionId: string, text: string) {
async agentType(sessionId: string | undefined, text: string) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'type', text });
session.lastAction = 'type';
return publicSession(session);
},
async agentKey(sessionId: string, key: string) {
async agentKey(sessionId: string | undefined, key: string) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'key', key });
session.lastAction = `key:${key}`;
return publicSession(session);
},
async agentScroll(sessionId: string, input: { direction: ScrollDirection; amount?: number; x?: number; y?: number }) {
async agentScroll(sessionId: string | undefined, input: { direction: ScrollDirection; amount?: number; x?: number; y?: number }) {
const session = await this.getConsentedSession(sessionId);
const point = typeof input.x === 'number' && typeof input.y === 'number' ? { x: input.x, y: input.y } : undefined;
await performAction(session, { type: 'scroll', direction: input.direction, amount: input.amount, point });
@@ -828,16 +777,48 @@ export const computerUseService = {
return publicSession(session);
},
async agentWait(sessionId: string, timeoutMs?: number) {
async agentWait(sessionId?: string, timeoutMs?: number) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'wait', ms: timeoutMs });
session.lastAction = 'wait';
return publicSession(session);
},
async agentStopSession(sessionId: string) {
async agentStopSession(sessionId?: string) {
assertAgentToolsAvailable();
return this.stopSession({ id: AGENT_OWNER_ID }, sessionId);
const normalizedSessionId = normalizeSessionId(sessionId);
if (normalizedSessionId) {
return this.stopSession({ id: AGENT_OWNER_ID }, normalizedSessionId);
}
await expireStaleSessions();
const existing = findActiveAgentSession();
if (!existing) {
return { stopped: false };
}
return this.stopSession({ id: AGENT_OWNER_ID }, existing.id);
},
/**
 * Runs one semantic Computer Use tool against a consented session.
 *
 * Rejects tool names that are not in `semanticOperationNames`. The session is
 * resolved through `getConsentedSession`; the tool arguments are forwarded with
 * the session arguments stripped and the resolved session id attached. In the
 * cloud runtime the call is relayed to the desktop agent, otherwise it is
 * handled by `computerSemanticsService`. The result is applied to the session
 * and returned alongside the public session view.
 */
async callSemanticTool(toolName: string, args: Record<string, unknown>) {
  const isSupported = semanticOperationNames.has(toolName);
  if (!isSupported) {
    throw new Error(`Unsupported semantic Computer Use tool: ${toolName}`);
  }

  const sessionId = typeof args.sessionId === 'string' ? args.sessionId : undefined;
  const session = await this.getConsentedSession(normalizeSessionId(sessionId) ?? undefined);
  const toolArgs = { ...stripSessionArgs(args), sessionId: session.id };

  let semanticResult;
  if (getRuntime() === 'cloud') {
    semanticResult = await desktopAgentRelay.relay('semantic_tool', {
      sessionId: session.id,
      displaySize: session.displaySize,
      toolName,
      arguments: toolArgs,
    });
  } else {
    semanticResult = await computerSemanticsService.callTool(toolName, toolArgs);
  }

  applyRelayResult(session, semanticResult as RelayResult);
  session.lastAction = `semantic:${toolName}`;
  return { session: publicSession(session), result: semanticResult };
},
/**

View File

@@ -0,0 +1,82 @@
import { SemanticHelperProcess } from '@/modules/computer-use/semantics/helpers/semantic-helper-process.js';
import { resolveSemanticHelper } from '@/modules/computer-use/semantics/helpers/semantic-helper-resolver.js';
import type { SemanticAdapter, SemanticAdapterCapabilities } from '@/modules/computer-use/semantics/adapters/semantic-adapter.js';
import type { SemanticApp, SemanticAppState, SemanticToolInput } from '@/modules/computer-use/semantics/semantic-types.js';
/** Method names that `HelperSemanticAdapter` sends to the semantic helper process. */
type HelperMethod =
| 'list_apps'
| 'get_app_state'
| 'click_element'
| 'perform_secondary_action'
| 'set_value'
| 'type_text'
| 'press_key'
| 'scroll_element'
| 'drag';
/**
 * Semantic adapter that forwards every operation to a native helper executable
 * through `SemanticHelperProcess`. The helper is resolved and constructed
 * lazily on the first request and reused afterwards.
 */
export class HelperSemanticAdapter implements SemanticAdapter {
  private helper: SemanticHelperProcess | null = null;

  constructor(
    private readonly platform: NodeJS.Platform,
    private readonly arch: NodeJS.Architecture = process.arch,
  ) {}

  /** Reports the adapter's platform with every capability flag enabled. */
  capabilities(): SemanticAdapterCapabilities {
    const { platform } = this;
    return {
      platform,
      appDiscovery: true,
      accessibilityTree: true,
      nativeElementActions: true,
      nativeValueSetting: true,
      targetedInput: true,
    };
  }

  async listApps(): Promise<SemanticApp[]> {
    const apps = await this.request('list_apps', {});
    return apps as SemanticApp[];
  }

  async getAppState(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.requestState('get_app_state', input);
  }

  async clickElement(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.requestState('click_element', input);
  }

  async performSecondaryAction(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.requestState('perform_secondary_action', input);
  }

  async setValue(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.requestState('set_value', input);
  }

  async typeText(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.requestState('type_text', input);
  }

  async pressKey(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.requestState('press_key', input);
  }

  async scrollElement(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.requestState('scroll_element', input);
  }

  async drag(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.requestState('drag', input);
  }

  /** Sends a state-returning helper method and casts the reply to `SemanticAppState`. */
  private async requestState(method: HelperMethod, input: SemanticToolInput): Promise<SemanticAppState> {
    const state = await this.request(method, input);
    return state as SemanticAppState;
  }

  /** Forwards one request to the helper, creating it first if needed. */
  private async request(method: HelperMethod, params: Record<string, unknown>): Promise<unknown> {
    return this.acquireHelper().request(method, params);
  }

  /**
   * Returns the cached helper or resolves and constructs a new one.
   * Throws when no helper executable is available for this platform/arch.
   */
  private acquireHelper(): SemanticHelperProcess {
    if (this.helper) {
      return this.helper;
    }
    const resolution = resolveSemanticHelper(this.platform, this.arch);
    if (!resolution.available || !resolution.path) {
      throw new Error(resolution.reason || `Semantic helper is unavailable for ${this.platform}-${this.arch}.`);
    }
    this.helper = new SemanticHelperProcess(resolution.path);
    return this.helper;
  }
}

View File

@@ -0,0 +1,5 @@
import { HelperSemanticAdapter } from '@/modules/computer-use/semantics/adapters/helper-semantic-adapter.js';
/** Creates the helper-backed semantic adapter configured for the `darwin` platform. */
export function createMacOsSemanticAdapter(): HelperSemanticAdapter {
  const adapter = new HelperSemanticAdapter('darwin');
  return adapter;
}

View File

@@ -0,0 +1,23 @@
import type { SemanticApp, SemanticAppState, SemanticToolInput } from '@/modules/computer-use/semantics/semantic-types.js';
/**
 * Capability flags returned by `SemanticAdapter.capabilities()`.
 * NOTE(review): the exact meaning of each flag is not defined in this file —
 * confirm against the consumers before relying on a particular flag.
 */
export type SemanticAdapterCapabilities = {
// Platform the adapter was created for.
platform: NodeJS.Platform;
appDiscovery: boolean;
accessibilityTree: boolean;
nativeElementActions: boolean;
nativeValueSetting: boolean;
targetedInput: boolean;
};
/**
 * Contract for a platform semantic adapter. `listApps` resolves to the known
 * apps; every other operation takes a `SemanticToolInput` and resolves to the
 * resulting `SemanticAppState`.
 */
export type SemanticAdapter = {
capabilities(): SemanticAdapterCapabilities;
listApps(): Promise<SemanticApp[]>;
getAppState(input: SemanticToolInput): Promise<SemanticAppState>;
clickElement(input: SemanticToolInput): Promise<SemanticAppState>;
performSecondaryAction(input: SemanticToolInput): Promise<SemanticAppState>;
setValue(input: SemanticToolInput): Promise<SemanticAppState>;
typeText(input: SemanticToolInput): Promise<SemanticAppState>;
pressKey(input: SemanticToolInput): Promise<SemanticAppState>;
scrollElement(input: SemanticToolInput): Promise<SemanticAppState>;
drag(input: SemanticToolInput): Promise<SemanticAppState>;
};

View File

@@ -0,0 +1,5 @@
import { HelperSemanticAdapter } from '@/modules/computer-use/semantics/adapters/helper-semantic-adapter.js';
/** Creates the helper-backed semantic adapter configured for the `win32` platform. */
export function createWindowsSemanticAdapter(): HelperSemanticAdapter {
  const adapter = new HelperSemanticAdapter('win32');
  return adapter;
}

View File

@@ -0,0 +1,437 @@
import AppKit
import ApplicationServices
import Foundation
/// String-keyed dictionary used for request parameters and response payloads.
typealias JSON = [String: Any]
/// Serializable snapshot of one accessibility element, built by `record(_:index:)`.
struct ElementRecord {
// Identifier assigned during the tree walk (1-based position, as a string).
let index: String
// Accessibility role; "AXUnknown" when the element reports none.
let role: String
// Element title, falling back to its description attribute.
let title: String?
// Value attribute when it is a string.
let value: String?
// Position and size under the keys "x", "y", "width", "height"; nil when unavailable.
let bounds: [String: Double]?
// Names of the accessibility actions the element reports.
let actions: [String]
}
// Element records per state id; written by getAppState and read by pointForElement.
var stateElements: [String: [ElementRecord]] = [:]
// Live AXUIElement references per state id, keyed by element index; read by cachedElement.
var stateAxElements: [String: [String: AXUIElement]] = [:]
/// Writes `object` to stdout as one line of JSON and flushes immediately.
/// If the object cannot be encoded, a fixed error line is written instead.
func jsonLine(_ object: Any) {
    var encoded: String?
    if JSONSerialization.isValidJSONObject(object),
       let payload = try? JSONSerialization.data(withJSONObject: object) {
        encoded = String(data: payload, encoding: .utf8)
    }
    print(encoded ?? "{\"error\":\"Failed to encode JSON\"}")
    fflush(stdout)
}
/// Emits a success envelope `{id, result}`; a missing id is encoded as JSON null.
func respond(id: Any?, result: Any) {
    let envelope: JSON = ["id": id ?? NSNull(), "result": result]
    jsonLine(envelope)
}
/// Emits a failure envelope `{id, error}`; a missing id is encoded as JSON null.
func respondError(id: Any?, _ message: String) {
    let envelope: JSON = ["id": id ?? NSNull(), "error": message]
    jsonLine(envelope)
}
/// Reads an accessibility attribute and returns it when it is a string.
/// Returns nil when the read fails or the value has another type.
func stringAttr(_ element: AXUIElement, _ attr: CFString) -> String? {
    var raw: CFTypeRef?
    let status = AXUIElementCopyAttributeValue(element, attr, &raw)
    if status != .success {
        return nil
    }
    return raw as? String
}
/// Reads an accessibility attribute and returns it when it is a boolean.
/// Returns nil when the read fails or the value has another type.
func boolAttr(_ element: AXUIElement, _ attr: CFString) -> Bool? {
    var raw: CFTypeRef?
    let status = AXUIElementCopyAttributeValue(element, attr, &raw)
    if status != .success {
        return nil
    }
    return raw as? Bool
}
/// Reads an accessibility attribute holding a list of elements.
/// Returns an empty array when the read fails or the value is not an element list.
func arrayAttr(_ element: AXUIElement, _ attr: CFString) -> [AXUIElement] {
    var raw: CFTypeRef?
    let status = AXUIElementCopyAttributeValue(element, attr, &raw)
    if status != .success {
        return []
    }
    return (raw as? [AXUIElement]) ?? []
}
/// Returns the names of the accessibility actions the element reports,
/// or an empty array when they cannot be read.
func actions(_ element: AXUIElement) -> [String] {
    var actionNames: CFArray?
    if AXUIElementCopyActionNames(element, &actionNames) != .success {
        return []
    }
    return (actionNames as? [String]) ?? []
}
/// Returns the element's position and size under the keys "x", "y", "width"
/// and "height", or nil when either attribute is unavailable or malformed.
func bounds(_ element: AXUIElement) -> [String: Double]? {
    var positionRef: CFTypeRef?
    var sizeRef: CFTypeRef?
    guard AXUIElementCopyAttributeValue(element, kAXPositionAttribute as CFString, &positionRef) == .success,
          AXUIElementCopyAttributeValue(element, kAXSizeAttribute as CFString, &sizeRef) == .success,
          let positionValue = positionRef,
          let sizeValue = sizeRef,
          // Casts to CoreFoundation types are unchecked, so verify the runtime
          // type before force-casting to AXValue below.
          CFGetTypeID(positionValue) == AXValueGetTypeID(),
          CFGetTypeID(sizeValue) == AXValueGetTypeID()
    else { return nil }
    var point = CGPoint.zero
    var size = CGSize.zero
    guard AXValueGetValue(positionValue as! AXValue, .cgPoint, &point),
          AXValueGetValue(sizeValue as! AXValue, .cgSize, &size)
    else { return nil }
    return [
        "x": Double(point.x),
        "y": Double(point.y),
        "width": Double(size.width),
        "height": Double(size.height),
    ]
}
/// Builds the serializable record for one accessibility element.
/// The role defaults to "AXUnknown"; the title falls back to the description attribute.
func record(_ element: AXUIElement, index: String) -> ElementRecord {
    let role = stringAttr(element, kAXRoleAttribute as CFString) ?? "AXUnknown"
    let title = stringAttr(element, kAXTitleAttribute as CFString)
        ?? stringAttr(element, kAXDescriptionAttribute as CFString)
    let value = stringAttr(element, kAXValueAttribute as CFString)
    let frame = bounds(element)
    let supportedActions = actions(element)
    return ElementRecord(
        index: index,
        role: role,
        title: title,
        value: value,
        bounds: frame,
        actions: supportedActions
    )
}
/// Looks up the live accessibility element cached for `stateId` + `element_index`.
/// Returns nil when either parameter is missing or nothing is cached for them.
func cachedElement(_ params: JSON) -> AXUIElement? {
    if let stateId = params["stateId"] as? String,
       let elementIndex = params["element_index"] as? String {
        return stateAxElements[stateId]?[elementIndex]
    }
    return nil
}
/// Converts a record into its JSON form. `index`, `role` and `actions` are
/// always present; `title`, `value` and `bounds` are included only when set.
func dictionary(_ record: ElementRecord) -> JSON {
    var json: JSON = [:]
    json["index"] = record.index
    json["role"] = record.role
    json["actions"] = record.actions
    if let title = record.title {
        json["title"] = title
    }
    if let value = record.value {
        json["value"] = value
    }
    if let frame = record.bounds {
        json["bounds"] = frame
    }
    return json
}
/// Finds a running regular application matching `query`, case-insensitively.
/// Match priority: exact bundle identifier, exact localized name, then a
/// localized name containing the query. Throws a 404 error when nothing matches.
func resolveApp(_ query: String) throws -> NSRunningApplication {
    let needle = query.lowercased()
    let candidates = NSWorkspace.shared.runningApplications.filter { $0.activationPolicy == .regular }
    let matchers: [(NSRunningApplication) -> Bool] = [
        { $0.bundleIdentifier?.lowercased() == needle },
        { ($0.localizedName ?? "").lowercased() == needle },
        { ($0.localizedName ?? "").lowercased().contains(needle) },
    ]
    for matches in matchers {
        if let hit = candidates.first(where: matches) {
            return hit
        }
    }
    throw NSError(domain: "CloudCLISemantics", code: 404, userInfo: [NSLocalizedDescriptionKey: "App is not running: \(query)"])
}
/// Lists running applications whose activation policy is `.regular`, each with
/// id, name, bundle identifier, pid and a `running` flag.
func listApps() -> [[String: Any]] {
    var result: [[String: Any]] = []
    for app in NSWorkspace.shared.runningApplications where app.activationPolicy == .regular {
        let bundleId = app.bundleIdentifier
        let displayName = app.localizedName
        result.append([
            "id": bundleId ?? displayName ?? "\(app.processIdentifier)",
            "name": displayName ?? bundleId ?? "Unknown",
            "bundleIdentifier": bundleId ?? "",
            "pid": Int(app.processIdentifier),
            "running": true,
        ])
    }
    return result
}
/// Depth-first, pre-order traversal of the accessibility tree rooted at `element`.
/// Appends a record per visited element (indexed by 1-based visit order) and
/// stores the live element under the same index. Stops descending past
/// `maxDepth` and stops entirely once `limit` records have been collected.
func walk(_ element: AXUIElement, depth: Int, maxDepth: Int, records: inout [ElementRecord], axRecords: inout [String: AXUIElement], limit: Int) {
    guard depth <= maxDepth, records.count < limit else { return }
    let index = String(records.count + 1)
    records.append(record(element, index: index))
    axRecords[index] = element
    let children = arrayAttr(element, kAXChildrenAttribute as CFString)
    for child in children {
        walk(child, depth: depth + 1, maxDepth: maxDepth, records: &records, axRecords: &axRecords, limit: limit)
        if records.count >= limit {
            break
        }
    }
}
/// Captures the main display and returns it as a base64 PNG data URL,
/// or nil when the capture or the PNG encoding fails.
func pngDataUrlForMainDisplay() -> String? {
    guard let capture = CGDisplayCreateImage(CGMainDisplayID()),
          let encoded = NSBitmapImageRep(cgImage: capture).representation(using: .png, properties: [:])
    else {
        return nil
    }
    return "data:image/png;base64," + encoded.base64EncodedString()
}
/// Ids of cached snapshots in insertion order (oldest first), used for eviction.
var stateOrder: [String] = []
/// Maximum number of snapshots kept in `stateElements` / `stateAxElements`.
let maxCachedStates = 64

/// Captures the accessibility state of the app named by `params["app"]`.
///
/// Walks the first window (or the application element when there is no window)
/// to a depth of 5 and at most 300 elements, caches the result under a fresh
/// state id, and returns that id with a main-display screenshot, the display
/// size, and the element list. Throws when the app cannot be resolved.
///
/// Every action handler returns a new snapshot, so the caches would grow for
/// the lifetime of the helper; only the most recent `maxCachedStates`
/// snapshots are kept and older state ids stop resolving.
func getAppState(_ params: JSON) throws -> JSON {
    let appName = params["app"] as? String ?? ""
    let app = try resolveApp(appName)
    let axApp = AXUIElementCreateApplication(app.processIdentifier)
    let windows = arrayAttr(axApp, kAXWindowsAttribute as CFString)
    let root = windows.first ?? axApp
    var records: [ElementRecord] = []
    var axRecords: [String: AXUIElement] = [:]
    walk(root, depth: 0, maxDepth: 5, records: &records, axRecords: &axRecords, limit: 300)
    let stateId = "state_\(UUID().uuidString)"
    stateElements[stateId] = records
    stateAxElements[stateId] = axRecords
    stateOrder.append(stateId)
    // Evict the oldest snapshots so the retained AX references stay bounded.
    while stateOrder.count > maxCachedStates {
        let evicted = stateOrder.removeFirst()
        stateElements.removeValue(forKey: evicted)
        stateAxElements.removeValue(forKey: evicted)
    }
    let elements = records.map(dictionary)
    return [
        "stateId": stateId,
        "app": app.localizedName ?? app.bundleIdentifier ?? appName,
        "platform": "darwin",
        "screenshotDataUrl": pngDataUrlForMainDisplay() ?? NSNull(),
        "displaySize": [
            "width": Int(CGDisplayPixelsWide(CGMainDisplayID())),
            "height": Int(CGDisplayPixelsHigh(CGMainDisplayID())),
        ],
        "elements": elements,
        "accessibilityTree": elements,
        "treeText": elements.map { "\($0["index"] ?? "") \($0["role"] ?? "") \($0["title"] ?? "")" }.joined(separator: "\n"),
    ]
}
/// Maps a button name to a `CGMouseButton`: "right" and "middle" map to their
/// buttons; anything else, including non-string input, maps to `.left`.
func cgMouseButton(_ value: Any?) -> CGMouseButton {
    let name = value as? String
    if name == "right" {
        return .right
    }
    if name == "middle" {
        return .center
    }
    return .left
}
/// Returns the (mouse-down, mouse-up) event types for the given button.
func mouseEventTypes(_ button: CGMouseButton) -> (CGEventType, CGEventType) {
    if button == .right {
        return (.rightMouseDown, .rightMouseUp)
    }
    if button == .center {
        return (.otherMouseDown, .otherMouseUp)
    }
    return (.leftMouseDown, .leftMouseUp)
}
/// Posts `clickCount` (at least one) synthetic clicks of `button` at `point`.
///
/// Each down/up pair carries its running click number in the
/// `mouseEventClickState` field, so the second click of a pair is delivered as
/// a double click rather than as two unrelated single clicks.
/// Throws when the event source cannot be created.
func postMouseClick(point: CGPoint, button: CGMouseButton, clickCount: Int = 1) throws {
    guard let source = CGEventSource(stateID: .combinedSessionState) else {
        throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create CGEventSource"])
    }
    let eventTypes = mouseEventTypes(button)
    for clickIndex in 1...max(1, clickCount) {
        let down = CGEvent(mouseEventSource: source, mouseType: eventTypes.0, mouseCursorPosition: point, mouseButton: button)
        let up = CGEvent(mouseEventSource: source, mouseType: eventTypes.1, mouseCursorPosition: point, mouseButton: button)
        // Synthetic events do not get a click count automatically.
        down?.setIntegerValueField(.mouseEventClickState, value: Int64(clickIndex))
        up?.setIntegerValueField(.mouseEventClickState, value: Int64(clickIndex))
        down?.post(tap: .cghidEventTap)
        up?.post(tap: .cghidEventTap)
        usleep(80_000)
    }
}
/// Posts a synthetic drag of `button` from `from` to `to`: button down at the
/// start point, one dragged event at the end point, then button up, with a
/// short pause between steps. Throws when the event source cannot be created.
func postDrag(from: CGPoint, to: CGPoint, button: CGMouseButton) throws {
    guard let source = CGEventSource(stateID: .combinedSessionState) else {
        throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create CGEventSource"])
    }
    let eventTypes = mouseEventTypes(button)
    // The dragged event type must match the held button; a left-dragged event
    // sent while the right or middle button is down is not a drag of that button.
    let draggedType: CGEventType
    switch button {
    case .right: draggedType = .rightMouseDragged
    case .center: draggedType = .otherMouseDragged
    default: draggedType = .leftMouseDragged
    }
    CGEvent(mouseEventSource: source, mouseType: eventTypes.0, mouseCursorPosition: from, mouseButton: button)?.post(tap: .cghidEventTap)
    usleep(80_000)
    CGEvent(mouseEventSource: source, mouseType: draggedType, mouseCursorPosition: to, mouseButton: button)?.post(tap: .cghidEventTap)
    usleep(80_000)
    CGEvent(mouseEventSource: source, mouseType: eventTypes.1, mouseCursorPosition: to, mouseButton: button)?.post(tap: .cghidEventTap)
}
/// Runs `script` through `/usr/bin/osascript -e` and waits for it to finish.
/// Throws an error carrying the script's stderr (or a generic message when
/// stderr is empty) if the process exits with a non-zero status.
func runAppleScript(_ script: String) throws {
    let process = Process()
    process.executableURL = URL(fileURLWithPath: "/usr/bin/osascript")
    process.arguments = ["-e", script]
    // The script's stdout is never used; discard it instead of leaving an
    // undrained pipe that could block the child once its buffer fills.
    process.standardOutput = FileHandle.nullDevice
    let stderr = Pipe()
    process.standardError = stderr
    try process.run()
    // Drain stderr before waiting so a chatty child cannot block on a full pipe.
    let errorData = stderr.fileHandleForReading.readDataToEndOfFile()
    process.waitUntilExit()
    if process.terminationStatus != 0 {
        let detail = String(data: errorData, encoding: .utf8)?
            .trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
        // Never report an empty message: the caller forwards it as the error text.
        let message = detail.isEmpty ? "AppleScript failed." : detail
        throw NSError(domain: "CloudCLISemantics", code: Int(process.terminationStatus), userInfo: [NSLocalizedDescriptionKey: message])
    }
}
/// Escapes backslashes and double quotes so `value` can be embedded inside a
/// double-quoted AppleScript string literal. Backslashes are escaped first so
/// the backslashes added for quotes are not doubled.
func escapedAppleScriptString(_ value: String) -> String {
    let backslashEscaped = value.replacingOccurrences(of: "\\", with: "\\\\")
    let quoteEscaped = backslashEscaped.replacingOccurrences(of: "\"", with: "\\\"")
    return quoteEscaped
}
/// Resolves the screen point an action should target.
/// Explicit `x`/`y` parameters win; otherwise the centre of the cached bounds
/// for `stateId` + `element_index` is used. Returns nil when neither is available.
func pointForElement(_ params: JSON) -> CGPoint? {
    if let x = params["x"] as? Double, let y = params["y"] as? Double {
        return CGPoint(x: x, y: y)
    }
    guard let stateId = params["stateId"] as? String,
          let elementIndex = params["element_index"] as? String
    else {
        return nil
    }
    let match = stateElements[stateId]?.first { $0.index == elementIndex }
    guard let frame = match?.bounds,
          let x = frame["x"],
          let y = frame["y"],
          let width = frame["width"],
          let height = frame["height"]
    else {
        return nil
    }
    return CGPoint(x: x + width / 2, y: y + height / 2)
}
/// Clicks an element and returns the refreshed app state.
/// A single left click on a cached element that supports the press action is
/// performed through accessibility; otherwise a synthetic mouse click is
/// posted at the resolved point. Throws when no target point can be resolved.
func click(_ params: JSON) throws -> JSON {
    let button = cgMouseButton(params["mouse_button"])
    let clickCount = params["click_count"] as? Int ?? 1
    if button == .left, clickCount == 1, let element = cachedElement(params) {
        if actions(element).contains(kAXPressAction as String),
           AXUIElementPerformAction(element, kAXPressAction as CFString) == .success {
            return try getAppState(params)
        }
    }
    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "click_element requires x/y or stateId + element_index"])
    }
    try postMouseClick(point: point, button: button, clickCount: clickCount)
    return try getAppState(params)
}
/// Opens the secondary (context) action for an element and returns the
/// refreshed app state. Uses the accessibility show-menu action when the
/// cached element supports it; otherwise posts a right click at the resolved
/// point. Throws when no target point can be resolved.
func performSecondaryAction(_ params: JSON) throws -> JSON {
    if let element = cachedElement(params) {
        let supportsMenu = actions(element).contains(kAXShowMenuAction as String)
        if supportsMenu, AXUIElementPerformAction(element, kAXShowMenuAction as CFString) == .success {
            return try getAppState(params)
        }
    }
    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "perform_secondary_action requires x/y or stateId + element_index"])
    }
    try postMouseClick(point: point, button: .right)
    return try getAppState(params)
}
/// Sets an element's value and returns the refreshed app state.
/// First tries to write the accessibility value attribute on the cached
/// element. If that is not possible, clicks the resolved point, sends
/// Command-A, and types the value through System Events.
/// Throws when `value` is missing or no target point can be resolved.
func setValue(_ params: JSON) throws -> JSON {
    guard let value = params["value"] as? String else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "set_value requires value"])
    }
    if let element = cachedElement(params) {
        let status = AXUIElementSetAttributeValue(element, kAXValueAttribute as CFString, value as CFTypeRef)
        if status == .success {
            return try getAppState(params)
        }
    }
    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "set_value requires x/y or stateId + element_index"])
    }
    try postMouseClick(point: point, button: .left)
    let selectAllScript = "tell application \"System Events\" to keystroke \"a\" using command down"
    try runAppleScript(selectAllScript)
    let typeValueScript = "tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(value))\""
    try runAppleScript(typeValueScript)
    return try getAppState(params)
}
/// Types `params["text"]` (empty when missing) through System Events keystrokes
/// and returns the refreshed app state.
func typeText(_ params: JSON) throws -> JSON {
    let text = params["text"] as? String ?? ""
    let script = "tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(text))\""
    try runAppleScript(script)
    return try getAppState(params)
}
/// Builds the AppleScript ` using {...}` clause for the given modifier names.
/// Names are matched case-insensitively, unrecognised names are dropped, and
/// an empty string is returned when no modifier is recognised.
func appleScriptModifiers(_ parts: [String]) -> String {
    let clauseByName: [String: String] = [
        "cmd": "command down",
        "command": "command down",
        "meta": "command down",
        "ctrl": "control down",
        "control": "control down",
        "alt": "option down",
        "option": "option down",
        "shift": "shift down",
    ]
    let clauses = parts.compactMap { clauseByName[$0.lowercased()] }
    if clauses.isEmpty {
        return ""
    }
    return " using {\(clauses.joined(separator: ", "))}"
}
/// Returns the key code for a named non-printing key (case-insensitive),
/// or nil when the key has no entry and should be sent as a keystroke.
func appleScriptKeyCode(_ key: String) -> Int? {
    let keyCodes: [String: Int] = [
        "return": 36,
        "enter": 36,
        "tab": 48,
        "space": 49,
        "delete": 51,
        "backspace": 51,
        "escape": 53,
        "esc": 53,
        "left": 123,
        "right": 124,
        "down": 125,
        "up": 126,
    ]
    return keyCodes[key.lowercased()]
}
/// Presses a key chord such as "cmd+shift+a" through System Events and returns
/// the refreshed app state. The chord is split on "+": the last part is the
/// key and the preceding parts are modifiers. Named keys are sent by key code,
/// everything else as a keystroke.
func pressKey(_ params: JSON) throws -> JSON {
    let raw = params["key"] as? String ?? ""
    var parts: [String] = []
    for piece in raw.split(separator: "+") {
        let trimmed = String(piece).trimmingCharacters(in: .whitespacesAndNewlines)
        if !trimmed.isEmpty {
            parts.append(trimmed)
        }
    }
    let key = parts.last ?? raw
    let modifiers = appleScriptModifiers(Array(parts.dropLast()))
    let script: String
    if let keyCode = appleScriptKeyCode(key) {
        script = "tell application \"System Events\" to key code \(keyCode)\(modifiers)"
    } else {
        script = "tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(key))\"\(modifiers)"
    }
    try runAppleScript(script)
    return try getAppState(params)
}
/// Moves the cursor to the target point, posts one scroll-wheel event, and
/// returns the refreshed app state.
///
/// `direction` is "up", "down", "left" or "right" (default "down"); `pages`
/// (default 1) is scaled to 8 lines per page with a minimum of one line.
/// Throws when no target point can be resolved.
func scrollElement(_ params: JSON) throws -> JSON {
    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "scroll_element requires x/y or stateId + element_index"])
    }
    CGWarpMouseCursorPosition(point)
    let direction = params["direction"] as? String ?? "down"
    let pages = params["pages"] as? Double ?? 1.0
    // Converting an out-of-range Double to Int32 traps and would kill the
    // helper, so clamp the line count before converting.
    let lines = min(max(1.0, abs(pages) * 8.0), Double(Int32.max))
    let amount = Int32(lines)
    let vertical = direction == "up" ? amount : direction == "down" ? -amount : 0
    let horizontal = direction == "left" ? amount : direction == "right" ? -amount : 0
    CGEvent(scrollWheelEvent2Source: nil, units: .line, wheelCount: 2, wheel1: vertical, wheel2: horizontal)?.post(tap: .cghidEventTap)
    return try getAppState(params)
}
/// Drags from (`from_x`, `from_y`) to (`to_x`, `to_y`) with the requested mouse
/// button and returns the refreshed app state.
/// Throws when any of the four coordinates is missing or not numeric.
func drag(_ params: JSON) throws -> JSON {
    let coordinateKeys = ["from_x", "from_y", "to_x", "to_y"]
    let coordinates = coordinateKeys.compactMap { params[$0] as? Double }
    guard coordinates.count == coordinateKeys.count else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "drag requires from_x/from_y/to_x/to_y"])
    }
    let start = CGPoint(x: coordinates[0], y: coordinates[1])
    let end = CGPoint(x: coordinates[2], y: coordinates[3])
    try postDrag(from: start, to: end, button: cgMouseButton(params["mouse_button"]))
    return try getAppState(params)
}
/// Dispatches one decoded request to its handler and writes exactly one
/// response: the handler's result, the thrown error's description, or a
/// "not implemented" error for unknown methods.
func handle(_ request: JSON) {
    let id = request["id"]
    let method = request["method"] as? String ?? ""
    let params = request["params"] as? JSON ?? [:]

    // list_apps takes no parameters and cannot throw.
    if method == "list_apps" {
        respond(id: id, result: listApps())
        return
    }

    let handlers: [String: (JSON) throws -> JSON] = [
        "get_app_state": getAppState,
        "click_element": click,
        "perform_secondary_action": performSecondaryAction,
        "set_value": setValue,
        "type_text": typeText,
        "press_key": pressKey,
        "scroll_element": scrollElement,
        "drag": drag,
    ]
    guard let handler = handlers[method] else {
        respondError(id: id, "Method is not implemented yet: \(method)")
        return
    }
    do {
        respond(id: id, result: try handler(params))
    } catch {
        respondError(id: id, error.localizedDescription)
    }
}
// Serve newline-delimited JSON requests from stdin until end of input.
// Lines that are not a JSON object get an error response with a null id.
while let rawLine = readLine() {
    if let payload = rawLine.data(using: .utf8),
       let decoded = try? JSONSerialization.jsonObject(with: payload),
       let request = decoded as? JSON {
        handle(request)
    } else {
        respondError(id: nil, "Invalid JSON request")
    }
}

View File

@@ -0,0 +1,124 @@
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import readline from 'node:readline';
/** Generic JSON object used for request params and parsed helper messages. */
type JsonRecord = Record<string, unknown>;
/** Bookkeeping for one in-flight helper request, keyed by request id. */
type PendingRequest = {
// Settles the promise returned by `request()` with the helper's result.
resolve: (value: unknown) => void;
// Fails the promise returned by `request()`.
reject: (error: Error) => void;
// Timeout handle; cleared when the request settles.
timer: ReturnType<typeof setTimeout>;
};
/** Request timeout parsed once from `CLOUDCLI_SEMANTICS_HELPER_TIMEOUT_MS` (default 60000). */
const DEFAULT_TIMEOUT_MS = Number.parseInt(process.env.CLOUDCLI_SEMANTICS_HELPER_TIMEOUT_MS || '60000', 10);

/** Returns the configured timeout, or 60000 ms when it is not a positive finite number. */
function timeoutMs(): number {
  if (!Number.isFinite(DEFAULT_TIMEOUT_MS) || DEFAULT_TIMEOUT_MS <= 0) {
    return 60000;
  }
  return DEFAULT_TIMEOUT_MS;
}
/** Extracts a printable message: `Error#message` for errors, `String(value)` otherwise. */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
/**
 * Manages one native semantic helper child process and speaks a
 * newline-delimited JSON request/response protocol over its stdio.
 *
 * The child is spawned lazily on the first request and respawned on the next
 * request after it exits. Each request gets a numeric id; responses are
 * matched by that id. Requests that receive no response are rejected after
 * `timeoutMs()`.
 */
export class SemanticHelperProcess {
  private child: ChildProcessWithoutNullStreams | null = null;
  private reader: readline.Interface | null = null;
  private nextId = 1;
  private pending = new Map<number, PendingRequest>();

  constructor(private readonly executablePath: string) {}

  /**
   * Sends one request to the helper and resolves with its `result`.
   *
   * @throws when the helper's stdin is not writable.
   * Rejects when the helper reports an error, the write fails, the helper
   * exits or is stopped, or no response arrives within the timeout.
   */
  async request(method: string, params: JsonRecord): Promise<unknown> {
    this.ensureStarted();
    const child = this.child;
    if (!child?.stdin.writable) {
      throw new Error('Semantic helper process is not running.');
    }

    const id = this.nextId++;
    return new Promise((resolve, reject) => {
      const timer = setTimeout(() => {
        this.pending.delete(id);
        reject(new Error(`Semantic helper request timed out: ${method}`));
      }, timeoutMs());

      this.pending.set(id, { resolve, reject, timer });
      child.stdin.write(`${JSON.stringify({ id, method, params })}\n`, (error) => {
        if (!error) {
          return;
        }
        // Fail fast on a broken pipe instead of waiting for the timeout.
        const entry = this.pending.get(id);
        if (!entry) {
          return;
        }
        clearTimeout(entry.timer);
        this.pending.delete(id);
        entry.reject(new Error(`Failed to send semantic helper request: ${error.message}`));
      });
    });
  }

  /** Stops the helper, rejecting every in-flight request. Safe to call when not running. */
  stop(): void {
    const child = this.child;
    this.child = null;
    this.reader?.close();
    this.reader = null;
    this.rejectAll('Semantic helper stopped.');
    if (child) {
      try { child.kill('SIGTERM'); } catch { /* noop */ }
    }
  }

  /** Spawns the helper and wires its stdio if it is not already running. */
  private ensureStarted(): void {
    if (this.child) {
      return;
    }

    const child = spawn(this.executablePath, [], {
      stdio: ['pipe', 'pipe', 'pipe'],
      windowsHide: true,
    });
    const reader = readline.createInterface({ input: child.stdout });
    this.child = child;
    this.reader = reader;

    reader.on('line', (line) => this.handleLine(line));
    child.stderr.on('data', (chunk) => {
      const text = String(chunk).trim();
      if (text) {
        console.error('[SemanticHelper]', text);
      }
    });
    // Without a listener, an EPIPE on stdin (helper died mid-write) would be
    // raised as an uncaught exception. The write callback rejects the request.
    child.stdin.on('error', (error) => {
      console.error('[SemanticHelper]', `stdin error: ${error.message}`);
    });

    // Only tear down state that still belongs to this child: after stop() or a
    // failed spawn a newer child may already be running, and a late event from
    // the old child must not clear it or reject its requests.
    const release = (reason: string): void => {
      if (this.child !== child) {
        return;
      }
      this.child = null;
      this.reader = null;
      this.rejectAll(reason);
    };
    child.once('error', (error) => {
      release(`Failed to start semantic helper: ${error.message}`);
    });
    child.once('exit', (code) => {
      release(`Semantic helper exited with code ${code ?? 'null'}.`);
    });
  }

  /** Parses one stdout line and settles the pending request it answers, if any. */
  private handleLine(line: string): void {
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch (error) {
      console.error('[SemanticHelper] Invalid JSON response:', errorMessage(error));
      return;
    }
    // Valid JSON that is not an object (e.g. `null` or a number) has no id.
    if (typeof parsed !== 'object' || parsed === null) {
      return;
    }
    const message = parsed as JsonRecord;

    const id = typeof message.id === 'number' ? message.id : null;
    if (id === null) {
      return;
    }
    const pending = this.pending.get(id);
    if (!pending) {
      return;
    }

    clearTimeout(pending.timer);
    this.pending.delete(id);

    // Any present `error` field marks a failure, including an empty string,
    // which a plain truthiness check would report as success.
    if (message.error !== undefined && message.error !== null) {
      const detail = typeof message.error === 'string' && message.error.length > 0
        ? message.error
        : 'Semantic helper request failed.';
      pending.reject(new Error(detail));
      return;
    }
    pending.resolve(message.result);
  }

  /** Rejects and clears every in-flight request with the given reason. */
  private rejectAll(reason: string): void {
    for (const [id, request] of this.pending.entries()) {
      clearTimeout(request.timer);
      request.reject(new Error(reason));
      this.pending.delete(id);
    }
  }
}

View File

@@ -0,0 +1,97 @@
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
// ES modules do not provide __dirname; derive this module's directory from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
/** Platform identifiers for which this module defines a helper executable name. */
export type SemanticHelperPlatform = 'darwin' | 'win32';
/** Result of looking for the semantic helper executable on disk. */
export type SemanticHelperResolution = {
/** True when one of the candidate paths exists. */
available: boolean;
/** Absolute path of the helper that was found, or null when none was found. */
path: string | null;
/** Which candidate location matched, or 'missing' when none did. */
source: 'bundled' | 'dev' | 'missing';
/** Platform the lookup was performed for. */
platform: NodeJS.Platform;
/** CPU architecture the lookup was performed for. */
arch: NodeJS.Architecture;
/** Human-readable explanation; set when the helper is unavailable. */
reason?: string;
};
/**
 * Maps a platform to the helper's executable file name.
 *
 * @param platform - Node platform identifier.
 * @returns The file name for 'darwin' and 'win32', or null for any other platform.
 */
function helperExecutableName(platform: NodeJS.Platform): string | null {
  switch (platform) {
    case 'darwin':
      return 'CloudCLISemantics';
    case 'win32':
      return 'CloudCLISemantics.exe';
    default:
      return null;
  }
}
/**
 * Reports whether a path can be accessed.
 *
 * Probes for execute permission first and falls back to a plain existence
 * check, so a file that exists but is not executable still counts.
 *
 * @param filePath - Path to probe.
 * @returns True when either probe succeeds.
 */
function pathExists(filePath: string): boolean {
  const probes = [fs.constants.X_OK, fs.constants.F_OK];
  for (const mode of probes) {
    try {
      fs.accessSync(filePath, mode);
      return true;
    } catch {
      // This probe failed; try the next, weaker one.
    }
  }
  return false;
}
/**
 * Lists the locations where the helper executable is looked for, in priority order.
 *
 * @param platform - Node platform identifier.
 * @param arch - CPU architecture; combined with the platform into the `<platform>-<arch>` folder name.
 * @returns The bundled location (relative to this module) followed by the dev
 *   location (relative to the current working directory); empty when the
 *   platform has no helper executable name.
 */
function candidatePaths(platform: NodeJS.Platform, arch: NodeJS.Architecture): Array<{ source: 'bundled' | 'dev'; path: string }> {
  const executable = helperExecutableName(platform);
  if (executable) {
    const binFolder = `${platform}-${arch}`;
    const bundledPath = path.resolve(__dirname, '..', 'bin', binFolder, executable);
    const devPath = path.resolve(
      process.cwd(),
      'server',
      'modules',
      'computer-use',
      'semantics',
      'bin',
      binFolder,
      executable,
    );
    return [
      { source: 'bundled', path: bundledPath },
      { source: 'dev', path: devPath },
    ];
  }
  return [];
}
/**
 * Locates the semantic helper executable for a platform/architecture pair.
 *
 * @param platform - Platform to resolve for; defaults to the current process platform.
 * @param arch - Architecture to resolve for; defaults to the current process architecture.
 * @returns A resolution describing the first existing candidate, or an
 *   unavailable resolution with a `reason` when the platform is unsupported
 *   or no candidate exists.
 */
export function resolveSemanticHelper(
  platform: NodeJS.Platform = process.platform,
  arch: NodeJS.Architecture = process.arch,
): SemanticHelperResolution {
  const unavailable = (reason: string): SemanticHelperResolution => ({
    available: false,
    path: null,
    source: 'missing',
    platform,
    arch,
    reason,
  });

  const executable = helperExecutableName(platform);
  if (!executable) {
    return unavailable(`Semantic Computer Use helper is not supported on ${platform}.`);
  }

  // Candidates are ordered by priority; the first one present on disk wins.
  const match = candidatePaths(platform, arch).find((candidate) => pathExists(candidate.path));
  if (match) {
    return {
      available: true,
      path: match.path,
      source: match.source,
      platform,
      arch,
    };
  }

  return unavailable(`Bundled semantic helper was not found for ${platform}-${arch} (${executable}).`);
}

View File

@@ -0,0 +1,10 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Builds the Windows semantic helper as an executable whose assembly name is CloudCLISemantics. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0-windows</TargetFramework>
<Nullable>enable</Nullable>
<!-- The helper source relies on implicit usings (for example it uses MemoryStream and Thread without explicit using directives). -->
<ImplicitUsings>enable</ImplicitUsings>
<!-- NOTE(review): the helper source also uses System.Windows.Automation and System.Windows.Rect; confirm those references resolve with Windows Forms enabled alone. -->
<UseWindowsForms>true</UseWindowsForms>
<AssemblyName>CloudCLISemantics</AssemblyName>
</PropertyGroup>
</Project>

View File

@@ -0,0 +1,519 @@
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;
using System.Runtime.InteropServices;
using System.Text.Json;
using System.Windows.Automation;
static class Program
{
// Element snapshots recorded by GetAppState, keyed by state id; PointFor reads element bounds from here.
private static readonly Dictionary<string, List<ElementRecord>> StateElements = new();
// UI Automation elements recorded by GetAppState, keyed by state id and then by element index; read by AutomationElementFor.
private static readonly Dictionary<string, Dictionary<string, AutomationElement>> StateAutomationElements = new();
/// <summary>
/// Request loop. Reads one JSON request per line from standard input, dispatches it by
/// its <c>method</c> name and writes one JSON response line containing the request
/// <c>id</c> plus either <c>result</c> or <c>error</c>. Returns when standard input ends.
/// </summary>
public static void Main()
{
    // Decode stdin explicitly as UTF-8 rather than through the console's input code page,
    // so non-ASCII request text (for example a type_text payload) is not corrupted.
    using var input = new StreamReader(Console.OpenStandardInput(), new System.Text.UTF8Encoding(false));
    string? line;
    while ((line = input.ReadLine()) != null)
    {
        try
        {
            using var doc = JsonDocument.Parse(line);
            var root = doc.RootElement;
            var id = root.TryGetProperty("id", out var idValue) ? idValue.Clone() : default;
            var method = root.TryGetProperty("method", out var methodValue) ? methodValue.GetString() ?? "" : "";
            JsonElement parameters;
            if (root.TryGetProperty("params", out var paramsValue) && paramsValue.ValueKind == JsonValueKind.Object)
            {
                parameters = paramsValue.Clone();
            }
            else
            {
                // Missing or non-object params are treated as an empty object; the temporary
                // document is disposed once its root has been cloned.
                using var emptyDoc = JsonDocument.Parse("{}");
                parameters = emptyDoc.RootElement.Clone();
            }
            try
            {
                object result = method switch
                {
                    "list_apps" => ListApps(),
                    "get_app_state" => GetAppState(parameters),
                    "click_element" => ClickElement(parameters),
                    "perform_secondary_action" => PerformSecondaryAction(parameters),
                    "set_value" => SetValue(parameters),
                    "type_text" => TypeText(parameters),
                    "press_key" => PressKey(parameters),
                    "scroll_element" => ScrollElement(parameters),
                    "drag" => Drag(parameters),
                    _ => throw new InvalidOperationException($"Method is not implemented yet: {method}")
                };
                Write(new Dictionary<string, object?> { ["id"] = JsonValue(id), ["result"] = result });
            }
            catch (Exception ex)
            {
                // A failing method is reported against the request id so the caller can match it.
                Write(new Dictionary<string, object?> { ["id"] = JsonValue(id), ["error"] = ex.Message });
            }
        }
        catch (Exception ex)
        {
            // The request itself could not be read, so no id is available.
            Write(new Dictionary<string, object?> { ["id"] = null, ["error"] = $"Invalid JSON request: {ex.Message}" });
        }
    }
}
/// <summary>
/// Converts a scalar JSON element into a plain CLR value for re-serialization.
/// Strings, numbers and booleans are converted; every other kind yields null.
/// </summary>
private static object? JsonValue(JsonElement element)
{
    switch (element.ValueKind)
    {
        case JsonValueKind.String:
            return element.GetString();
        case JsonValueKind.Number:
            // The conditional keeps the original numeric typing of both branches.
            return element.TryGetInt64(out var number) ? number : element.GetDouble();
        case JsonValueKind.True:
            return true;
        case JsonValueKind.False:
            return false;
        default:
            return null;
    }
}
/// <summary>Serializes a response to JSON, writes it as one line to standard output and flushes.</summary>
private static void Write(object value)
{
    var json = JsonSerializer.Serialize(value);
    Console.WriteLine(json);
    Console.Out.Flush();
}
/// <summary>
/// Lists processes that have a main window, ordered by process name.
/// Each entry carries the process id (as string <c>id</c> and numeric <c>pid</c>),
/// the process name and the main window title.
/// </summary>
private static List<Dictionary<string, object?>> ListApps()
{
    var windowed =
        from candidate in Process.GetProcesses()
        where candidate.MainWindowHandle != IntPtr.Zero
        orderby candidate.ProcessName
        select new Dictionary<string, object?>
        {
            ["id"] = candidate.Id.ToString(),
            ["name"] = candidate.ProcessName,
            ["processName"] = candidate.ProcessName,
            ["pid"] = candidate.Id,
            ["running"] = true,
            ["windowTitle"] = candidate.MainWindowTitle
        };
    return windowed.ToList();
}
/// <summary>
/// Finds the windowed process that an <c>app</c> query refers to.
/// Matching order: exact process name, exact window title, process id,
/// then window title substring (all text comparisons ignore case).
/// </summary>
/// <param name="query">Process name, window title (or part of it), or the process id that ListApps reports as <c>id</c>.</param>
/// <exception cref="InvalidOperationException">The query is blank or no windowed process matches.</exception>
private static Process ResolveProcess(string query)
{
    var normalized = query.Trim();
    if (string.IsNullOrWhiteSpace(normalized))
    {
        throw new InvalidOperationException("app is required.");
    }
    var processes = Process.GetProcesses()
        .Where(process => process.MainWindowHandle != IntPtr.Zero)
        .ToList();
    // ListApps exposes the process id as the app "id"; accept it here so that id round-trips.
    var isPid = int.TryParse(
        normalized,
        System.Globalization.NumberStyles.None,
        System.Globalization.CultureInfo.InvariantCulture,
        out var pid);
    return processes.FirstOrDefault(process => process.ProcessName.Equals(normalized, StringComparison.OrdinalIgnoreCase))
        ?? processes.FirstOrDefault(process => process.MainWindowTitle.Equals(normalized, StringComparison.OrdinalIgnoreCase))
        ?? (isPid ? processes.FirstOrDefault(process => process.Id == pid) : null)
        ?? processes.FirstOrDefault(process => process.MainWindowTitle.Contains(normalized, StringComparison.OrdinalIgnoreCase))
        ?? throw new InvalidOperationException($"App is not running: {query}");
}
// Upper bound on remembered states. Every action returns a fresh state, so without a cap
// the per-state caches (up to 300 UI Automation element references each) grow for the
// whole lifetime of the helper process.
private const int MaxStoredStates = 64;
// State ids in creation order; used to evict the oldest cached state first.
private static readonly Queue<string> StateOrder = new();

/// <summary>
/// Captures the current state of an application: walks its UI Automation tree
/// (depth limit 5, at most 300 elements), takes a screenshot of the primary screen,
/// and caches the elements under a new state id for later element-based actions.
/// </summary>
/// <param name="parameters">Request parameters; <c>app</c> selects the process.</param>
/// <returns>The state payload (state id, app, screenshot, display size, window, elements, tree text).</returns>
/// <exception cref="InvalidOperationException">The app cannot be resolved or has no automation root.</exception>
private static Dictionary<string, object?> GetAppState(JsonElement parameters)
{
    var appQuery = ReadString(parameters, "app");
    var process = ResolveProcess(appQuery);
    var root = AutomationElement.FromHandle(process.MainWindowHandle)
        ?? throw new InvalidOperationException("No UI Automation root window is available.");
    var records = new List<ElementRecord>();
    var automationElements = new Dictionary<string, AutomationElement>();
    Walk(root, records, automationElements, 0, 5, 300);
    var stateId = $"state_{Guid.NewGuid()}";
    StateElements[stateId] = records;
    StateAutomationElements[stateId] = automationElements;
    StateOrder.Enqueue(stateId);
    // Drop the oldest states once the cap is exceeded so memory use stays bounded.
    while (StateOrder.Count > MaxStoredStates)
    {
        var expired = StateOrder.Dequeue();
        StateElements.Remove(expired);
        StateAutomationElements.Remove(expired);
    }
    var elements = records.Select(record => record.ToDictionary()).ToList();
    var bounds = root.Current.BoundingRectangle;
    return new Dictionary<string, object?>
    {
        ["stateId"] = stateId,
        ["app"] = process.ProcessName,
        ["platform"] = "win32",
        ["screenshotDataUrl"] = CaptureScreen(),
        ["displaySize"] = new Dictionary<string, object?>
        {
            ["width"] = (int)System.Windows.Forms.Screen.PrimaryScreen!.Bounds.Width,
            ["height"] = (int)System.Windows.Forms.Screen.PrimaryScreen!.Bounds.Height
        },
        ["window"] = new Dictionary<string, object?>
        {
            ["title"] = process.MainWindowTitle,
            ["bounds"] = BoundsDictionary(bounds)
        },
        ["elements"] = elements,
        // The same list is exposed under both keys.
        ["accessibilityTree"] = elements,
        ["treeText"] = string.Join("\n", elements.Select(element => $"{element["index"]} {element["role"]} {element.GetValueOrDefault("title")}"))
    };
}
/// <summary>
/// Clicks an element. A single left click on a cached element is first attempted
/// through the UI Automation Invoke pattern; otherwise (or if that fails) a synthetic
/// mouse click is sent to the explicit x/y or to the element's recorded center.
/// </summary>
/// <exception cref="InvalidOperationException">No click location can be determined.</exception>
private static Dictionary<string, object?> ClickElement(JsonElement parameters)
{
    var button = ReadString(parameters, "mouse_button");
    var isLeftButton = button == "" || button == "left";
    if (isLeftButton && ReadInt(parameters, "click_count", 1) == 1)
    {
        var target = AutomationElementFor(parameters);
        if (target != null && TryInvoke(target))
        {
            return GetAppState(parameters);
        }
    }

    var location = PointFor(parameters)
        ?? throw new InvalidOperationException("click_element requires x/y or stateId + element_index.");
    SendMouseClick(location.X, location.Y, button, ReadInt(parameters, "click_count", 1));
    return GetAppState(parameters);
}
/// <summary>Sends a single right click to the explicit x/y or to the cached element's center, then returns the new app state.</summary>
/// <exception cref="InvalidOperationException">No click location can be determined.</exception>
private static Dictionary<string, object?> PerformSecondaryAction(JsonElement parameters)
{
    var location = PointFor(parameters)
        ?? throw new InvalidOperationException("perform_secondary_action requires x/y or stateId + element_index.");
    SendMouseClick(location.X, location.Y, "right", 1);
    return GetAppState(parameters);
}
// Sets the text of a target element and returns the refreshed app state.
// Strategy, in order: (1) UI Automation ValuePattern on the cached element;
// (2) otherwise focus the target (SetFocus and/or a left click at its point),
// select all with Ctrl+A and type the escaped value via SendKeys.
// Throws InvalidOperationException when no target is given or it cannot be focused.
private static Dictionary<string, object?> SetValue(JsonElement parameters)
{
var value = ReadString(parameters, "value");
var element = AutomationElementFor(parameters);
// Tracks whether some focusing step succeeded before keystrokes are sent.
var focused = false;
if (element != null)
{
// Preferred path: set the value directly, no keyboard input needed.
if (element.TryGetCurrentPattern(ValuePattern.Pattern, out var valuePattern))
{
((ValuePattern)valuePattern).SetValue(value);
return GetAppState(parameters);
}
try
{
element.SetFocus();
focused = true;
}
catch
{
// Fall through to coordinate focus below.
}
}
var point = PointFor(parameters);
if (point != null)
{
// A click at the target point is performed even if SetFocus already succeeded.
SendMouseClick(point.Value.X, point.Value.Y, "left", 1);
focused = true;
}
else if (!focused && element == null)
{
// Neither a cached element nor a usable point was supplied.
throw new InvalidOperationException("set_value requires x/y or stateId + element_index.");
}
else if (!focused)
{
// An element was found but SetFocus failed and no point is available.
throw new InvalidOperationException("set_value could not focus the requested element.");
}
// "^a" is the SendKeys chord for Ctrl+A; the typed value then replaces the selection.
System.Windows.Forms.SendKeys.SendWait("^a");
System.Windows.Forms.SendKeys.SendWait(EscapeSendKeys(value));
return GetAppState(parameters);
}
/// <summary>Types the <c>text</c> parameter as literal keystrokes via SendKeys and returns the new app state.</summary>
private static Dictionary<string, object?> TypeText(JsonElement parameters)
{
    var keystrokes = EscapeSendKeys(ReadString(parameters, "text"));
    System.Windows.Forms.SendKeys.SendWait(keystrokes);
    return GetAppState(parameters);
}
/// <summary>Translates the <c>key</c> parameter into a SendKeys chord, sends it, and returns the new app state.</summary>
private static Dictionary<string, object?> PressKey(JsonElement parameters)
{
    var chord = ToSendKeysChord(ReadString(parameters, "key"));
    System.Windows.Forms.SendKeys.SendWait(chord);
    return GetAppState(parameters);
}
/// <summary>
/// Scrolls an element by whole pages. Uses the UI Automation ScrollPattern when the
/// cached element supports it; otherwise moves the cursor to the target point and
/// sends a mouse wheel event.
/// </summary>
/// <param name="parameters">
/// <c>direction</c> ("up", "down", "left", "right"), optional <c>pages</c> (default 1,
/// values below 1 are treated as 1), plus x/y or stateId + element_index.
/// </param>
/// <exception cref="InvalidOperationException">No scroll target can be determined.</exception>
private static Dictionary<string, object?> ScrollElement(JsonElement parameters)
{
    var element = AutomationElementFor(parameters);
    var direction = ReadString(parameters, "direction");
    var pages = ReadDouble(parameters, "pages", 1);
    // Keep the page count finite, at least 1, and bounded so a bad value cannot loop for long.
    if (!double.IsFinite(pages) || pages < 1) pages = 1;
    pages = Math.Min(pages, 100);
    if (element != null && element.TryGetCurrentPattern(ScrollPattern.Pattern, out var scrollPatternValue))
    {
        var scrollPattern = (ScrollPattern)scrollPatternValue;
        var vertical = direction == "up" ? ScrollAmount.LargeDecrement : direction == "down" ? ScrollAmount.LargeIncrement : ScrollAmount.NoAmount;
        var horizontal = direction == "left" ? ScrollAmount.LargeDecrement : direction == "right" ? ScrollAmount.LargeIncrement : ScrollAmount.NoAmount;
        // One large increment/decrement is one page, so repeat it for the requested page count.
        var repeats = (int)Math.Round(pages);
        for (var i = 0; i < repeats; i++)
        {
            scrollPattern.Scroll(horizontal, vertical);
        }
        return GetAppState(parameters);
    }
    var point = PointFor(parameters);
    if (point == null)
    {
        throw new InvalidOperationException("scroll_element requires x/y or stateId + element_index.");
    }
    SetCursorPos(point.Value.X, point.Value.Y);
    // 120 is one wheel notch. For the vertical wheel a positive amount scrolls up
    // (wheel rotated away from the user); for the horizontal wheel a positive amount
    // scrolls right. Any direction other than up/left/right is treated as down.
    var wheel = (int)Math.Round(pages * 120);
    var isHorizontal = direction == "left" || direction == "right";
    var isPositive = direction == "up" || direction == "right";
    if (!isPositive) wheel = -wheel;
    // 0x0800 = vertical wheel event, 0x01000 = horizontal wheel event.
    var wheelFlag = isHorizontal ? 0x01000u : 0x0800u;
    mouse_event(wheelFlag, 0, 0, unchecked((uint)wheel), UIntPtr.Zero);
    return GetAppState(parameters);
}
/// <summary>
/// Performs a left-button drag: moves the cursor to the start point, presses the button,
/// moves to the end point and releases, pausing 80 ms around the move.
/// </summary>
/// <exception cref="InvalidOperationException">Any of from_x/from_y/to_x/to_y is missing.</exception>
private static Dictionary<string, object?> Drag(JsonElement parameters)
{
    // A missing coordinate is read as NaN so it can be detected below.
    double Coordinate(string name) => ReadDouble(parameters, name, double.NaN);

    var startX = Coordinate("from_x");
    var startY = Coordinate("from_y");
    var endX = Coordinate("to_x");
    var endY = Coordinate("to_y");
    var coordinates = new[] { startX, startY, endX, endY };
    if (coordinates.Any(double.IsNaN))
    {
        throw new InvalidOperationException("drag requires from_x/from_y/to_x/to_y.");
    }

    // Same flag pair SendMouseClick uses for the left button (press, release).
    const uint leftPress = 0x0002;
    const uint leftRelease = 0x0004;

    SetCursorPos((int)Math.Round(startX), (int)Math.Round(startY));
    mouse_event(leftPress, 0, 0, 0, UIntPtr.Zero);
    Thread.Sleep(80);
    SetCursorPos((int)Math.Round(endX), (int)Math.Round(endY));
    Thread.Sleep(80);
    mouse_event(leftRelease, 0, 0, 0, UIntPtr.Zero);
    return GetAppState(parameters);
}
/// <summary>
/// Depth-first traversal of the UI Automation tree. Each visited element is recorded
/// under a 1-based index (its position in <paramref name="records"/>), and the live
/// element is stored under the same index. Stops at <paramref name="maxDepth"/> or
/// once <paramref name="limit"/> elements have been recorded.
/// </summary>
private static void Walk(AutomationElement element, List<ElementRecord> records, Dictionary<string, AutomationElement> automationElements, int depth, int maxDepth, int limit)
{
    var withinBudget = depth <= maxDepth && records.Count < limit;
    if (!withinBudget)
    {
        return;
    }

    var key = (records.Count + 1).ToString();
    records.Add(ElementRecord.From(element, key));
    automationElements[key] = element;

    foreach (AutomationElement child in element.FindAll(TreeScope.Children, Condition.TrueCondition))
    {
        Walk(child, records, automationElements, depth + 1, maxDepth, limit);
        if (records.Count >= limit)
        {
            break;
        }
    }
}
/// <summary>Reads a string property from a JSON object; returns "" when it is missing or not a string.</summary>
private static string ReadString(JsonElement element, string property)
{
    if (!element.TryGetProperty(property, out var value))
    {
        return "";
    }
    if (value.ValueKind != JsonValueKind.String)
    {
        return "";
    }
    return value.GetString() ?? "";
}
/// <summary>
/// Reads an integer property from a JSON object.
/// Returns <paramref name="defaultValue"/> when the property is missing, is not a JSON
/// number, or does not fit in a 32-bit integer.
/// </summary>
private static int ReadInt(JsonElement element, string property, int defaultValue)
{
    // TryGetInt32 throws for non-number kinds (string, null, ...), so check the kind first.
    return element.TryGetProperty(property, out var value)
        && value.ValueKind == JsonValueKind.Number
        && value.TryGetInt32(out var number)
        ? number
        : defaultValue;
}
/// <summary>
/// Reads a floating-point property from a JSON object.
/// Returns <paramref name="defaultValue"/> when the property is missing or is not a JSON number.
/// </summary>
private static double ReadDouble(JsonElement element, string property, double defaultValue)
{
    // TryGetDouble throws for non-number kinds (string, null, ...), so check the kind first.
    return element.TryGetProperty(property, out var value)
        && value.ValueKind == JsonValueKind.Number
        && value.TryGetDouble(out var number)
        ? number
        : defaultValue;
}
/// <summary>
/// Looks up the cached UI Automation element addressed by <c>stateId</c> and
/// <c>element_index</c>; returns null when either is blank or nothing is cached for them.
/// </summary>
private static AutomationElement? AutomationElementFor(JsonElement parameters)
{
    var stateId = ReadString(parameters, "stateId");
    var elementIndex = ReadString(parameters, "element_index");
    if (string.IsNullOrWhiteSpace(stateId) || string.IsNullOrWhiteSpace(elementIndex))
    {
        return null;
    }
    if (!StateAutomationElements.TryGetValue(stateId, out var cached))
    {
        return null;
    }
    return cached.TryGetValue(elementIndex, out var match) ? match : null;
}
/// <summary>
/// Determines the screen point an action should target: explicit numeric <c>x</c>/<c>y</c>
/// when both are given, otherwise the center of the cached element addressed by
/// <c>stateId</c> + <c>element_index</c>.
/// </summary>
/// <returns>The point, or null when neither source yields usable coordinates.</returns>
private static System.Drawing.Point? PointFor(JsonElement parameters)
{
    // TryGetDouble throws for non-number kinds (for example x: null), so check the kind first.
    if (parameters.TryGetProperty("x", out var xValue) && parameters.TryGetProperty("y", out var yValue)
        && xValue.ValueKind == JsonValueKind.Number && yValue.ValueKind == JsonValueKind.Number
        && xValue.TryGetDouble(out var x) && yValue.TryGetDouble(out var y))
    {
        return new System.Drawing.Point((int)Math.Round(x), (int)Math.Round(y));
    }
    var stateId = ReadString(parameters, "stateId");
    var elementIndex = ReadString(parameters, "element_index");
    if (string.IsNullOrWhiteSpace(stateId) || string.IsNullOrWhiteSpace(elementIndex)) return null;
    if (!StateElements.TryGetValue(stateId, out var elements)) return null;
    var element = elements.FirstOrDefault(item => item.Index == elementIndex);
    if (element?.Bounds == null) return null;
    var bounds = element.Bounds.Value;
    // An empty rectangle has infinite coordinates; its "center" would be NaN and the
    // integer conversion meaningless, so treat it as having no usable location.
    if (bounds.IsEmpty) return null;
    var centerX = bounds.Left + bounds.Width / 2;
    var centerY = bounds.Top + bounds.Height / 2;
    if (!double.IsFinite(centerX) || !double.IsFinite(centerY)) return null;
    return new System.Drawing.Point((int)Math.Round(centerX), (int)Math.Round(centerY));
}
/// <summary>Captures the primary screen and returns it as a PNG data URL (base64).</summary>
private static string CaptureScreen()
{
    var screenArea = System.Windows.Forms.Screen.PrimaryScreen!.Bounds;
    using (var image = new Bitmap(screenArea.Width, screenArea.Height))
    using (var canvas = Graphics.FromImage(image))
    using (var buffer = new MemoryStream())
    {
        canvas.CopyFromScreen(screenArea.Left, screenArea.Top, 0, 0, screenArea.Size);
        image.Save(buffer, ImageFormat.Png);
        var encoded = Convert.ToBase64String(buffer.ToArray());
        return $"data:image/png;base64,{encoded}";
    }
}
/// <summary>
/// Converts a rectangle into an <c>x</c>/<c>y</c>/<c>width</c>/<c>height</c> dictionary.
/// An empty rectangle is reported as all zeros, because its infinite coordinates
/// cannot be written as JSON numbers by the default serializer settings.
/// </summary>
private static Dictionary<string, object?> BoundsDictionary(System.Windows.Rect rect)
{
    if (rect.IsEmpty)
    {
        return new Dictionary<string, object?>
        {
            ["x"] = 0d,
            ["y"] = 0d,
            ["width"] = 0d,
            ["height"] = 0d
        };
    }
    return new Dictionary<string, object?>
    {
        ["x"] = rect.X,
        ["y"] = rect.Y,
        ["width"] = rect.Width,
        ["height"] = rect.Height
    };
}
// user32.dll import: moves the mouse cursor to the given coordinates.
[DllImport("user32.dll")]
private static extern bool SetCursorPos(int x, int y);
// user32.dll import used in this file to synthesize button presses/releases and wheel
// movement at the current cursor position. dwFlags selects the event; dwData carries
// the wheel amount for wheel events.
[DllImport("user32.dll")]
private static extern void mouse_event(uint dwFlags, uint dx, uint dy, uint dwData, UIntPtr dwExtraInfo);
/// <summary>
/// Moves the cursor to (x, y) and sends press/release pairs for the chosen button.
/// </summary>
/// <param name="x">Target x coordinate.</param>
/// <param name="y">Target y coordinate.</param>
/// <param name="button">"right" or "middle"; any other value means the left button.</param>
/// <param name="clickCount">Number of clicks; values below 1 are treated as 1.</param>
private static void SendMouseClick(int x, int y, string button, int clickCount)
{
    uint pressFlag;
    uint releaseFlag;
    switch (button)
    {
        case "right":
            pressFlag = 0x0008u;
            releaseFlag = 0x0010u;
            break;
        case "middle":
            pressFlag = 0x0020u;
            releaseFlag = 0x0040u;
            break;
        default:
            pressFlag = 0x0002u;
            releaseFlag = 0x0004u;
            break;
    }

    SetCursorPos(x, y);
    for (var remaining = Math.Max(1, clickCount); remaining > 0; remaining--)
    {
        mouse_event(pressFlag, 0, 0, 0, UIntPtr.Zero);
        mouse_event(releaseFlag, 0, 0, 0, UIntPtr.Zero);
        // Short pause after each click.
        Thread.Sleep(80);
    }
}
/// <summary>
/// Activates an element through the UI Automation Invoke pattern.
/// Returns false when the pattern is not supported or invoking it throws.
/// </summary>
private static bool TryInvoke(AutomationElement element)
{
    try
    {
        var supported = element.TryGetCurrentPattern(InvokePattern.Pattern, out var rawPattern);
        if (supported)
        {
            ((InvokePattern)rawPattern).Invoke();
        }
        return supported;
    }
    catch
    {
        return false;
    }
}
/// <summary>
/// Escapes text so SendKeys types it literally: each of the characters
/// <c>{ } + ^ % ~ ( ) [ ]</c> is wrapped in braces.
/// </summary>
/// <param name="value">Text to type.</param>
/// <returns>The SendKeys-safe representation of <paramref name="value"/>.</returns>
private static string EscapeSendKeys(string value)
{
    const string special = "{}+^%~()[]";
    // Escape in a single pass over the input. Chained Replace calls would re-escape
    // the closing brace that an earlier replacement had just inserted for "{".
    var escaped = new System.Text.StringBuilder(value.Length + 8);
    foreach (var character in value)
    {
        if (special.IndexOf(character) >= 0)
        {
            escaped.Append('{').Append(character).Append('}');
        }
        else
        {
            escaped.Append(character);
        }
    }
    return escaped.ToString();
}
/// <summary>
/// Converts a key description such as "ctrl+shift+s" into SendKeys syntax.
/// Everything before the last '+'-separated part is treated as a modifier
/// (ctrl/control and cmd/win/windows map to "^", alt to "%", shift to "+";
/// unknown modifiers are ignored); the last part is the key itself.
/// </summary>
private static string ToSendKeysChord(string key)
{
    static string ModifierSymbol(string name)
    {
        switch (name.ToLowerInvariant())
        {
            case "ctrl":
            case "control":
                return "^";
            case "alt":
                return "%";
            case "shift":
                return "+";
            case "cmd":
            case "win":
            case "windows":
                return "^";
            default:
                return "";
        }
    }

    var chord = key.Trim();
    if (!chord.Contains('+'))
    {
        return SendKeyName(chord);
    }

    var segments = chord.Split('+', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    var finalKey = segments.LastOrDefault() ?? "";
    var prefix = string.Concat(segments.Take(segments.Length - 1).Select(ModifierSymbol));
    return prefix + SendKeyName(finalKey);
}
/// <summary>
/// Maps a key name to its SendKeys token. Known names use their SendKeys keyword,
/// a single character is escaped and sent literally, and any other name is upper-cased
/// and wrapped in braces (for example "f5" becomes "{F5}").
/// </summary>
/// <exception cref="InvalidOperationException">The key name is empty.</exception>
private static string SendKeyName(string key)
{
    if (key.Length == 0)
    {
        // An empty name would otherwise become "{}", which SendKeys rejects with an unclear error.
        throw new InvalidOperationException("key is required.");
    }
    return key.ToLowerInvariant() switch
    {
        "return" or "enter" => "{ENTER}",
        "escape" or "esc" => "{ESC}",
        "tab" => "{TAB}",
        "backspace" => "{BACKSPACE}",
        "delete" or "del" => "{DELETE}",
        "left" => "{LEFT}",
        "right" => "{RIGHT}",
        "up" => "{UP}",
        "down" => "{DOWN}",
        "space" => " ",
        // SendKeys names these PGUP/PGDN; the generic upper-casing below would yield invalid keywords.
        "pageup" or "page_up" or "pgup" => "{PGUP}",
        "pagedown" or "page_down" or "pgdn" => "{PGDN}",
        _ => key.Length == 1 ? EscapeSendKeys(key) : $"{{{key.ToUpperInvariant()}}}"
    };
}
/// <summary>
/// Serializable snapshot of one UI Automation element taken during a tree walk.
/// </summary>
/// <param name="Index">Index assigned during the walk; addresses the element within a state.</param>
/// <param name="Role">Control type name with the "ControlType." prefix removed.</param>
/// <param name="Title">The element's automation name.</param>
/// <param name="Value">Current ValuePattern value, when available.</param>
/// <param name="Bounds">Bounding rectangle, or null when the element has no usable rectangle.</param>
/// <param name="Actions">Programmatic names of the patterns the element supports.</param>
private sealed record ElementRecord(
    string Index,
    string Role,
    string? Title,
    string? Value,
    System.Windows.Rect? Bounds,
    List<string> Actions)
{
    /// <summary>Builds a snapshot from a live automation element.</summary>
    public static ElementRecord From(AutomationElement element, string index)
    {
        var patterns = element.GetSupportedPatterns().Select(pattern => pattern.ProgrammaticName).ToList();
        var rect = element.Current.BoundingRectangle;
        // An empty rectangle has infinite coordinates, which cannot be serialized as JSON
        // numbers and has no meaningful center; record it as "no bounds" instead.
        var usable = !rect.IsEmpty
            && double.IsFinite(rect.X) && double.IsFinite(rect.Y)
            && double.IsFinite(rect.Width) && double.IsFinite(rect.Height);
        System.Windows.Rect? bounds = usable ? rect : null;
        return new ElementRecord(
            index,
            element.Current.ControlType.ProgrammaticName.Replace("ControlType.", ""),
            element.Current.Name,
            TryValue(element),
            bounds,
            patterns
        );
    }

    /// <summary>Converts the snapshot to its response shape; empty title/value and missing bounds are omitted.</summary>
    public Dictionary<string, object?> ToDictionary()
    {
        var output = new Dictionary<string, object?>
        {
            ["index"] = Index,
            ["role"] = Role,
            ["actions"] = Actions
        };
        if (!string.IsNullOrEmpty(Title)) output["title"] = Title;
        if (!string.IsNullOrEmpty(Value)) output["value"] = Value;
        if (Bounds != null) output["bounds"] = BoundsDictionary(Bounds.Value);
        return output;
    }

    /// <summary>Reads the element's ValuePattern value; returns null when unsupported or when reading fails.</summary>
    private static string? TryValue(AutomationElement element)
    {
        try
        {
            if (element.TryGetCurrentPattern(ValuePattern.Pattern, out var pattern))
            {
                return ((ValuePattern)pattern).Current.Value;
            }
        }
        catch
        {
            return null;
        }
        return null;
    }
}
}

View File

@@ -0,0 +1,83 @@
import { randomUUID } from 'node:crypto';
import type { SemanticAppState, SemanticElement } from '@/modules/computer-use/semantics/semantic-types.js';
/** State time-to-live in milliseconds, parsed from CLOUDCLI_COMPUTER_SEMANTIC_STATE_TTL_MS; the fallback string is 10 minutes. */
const DEFAULT_STATE_TTL_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_SEMANTIC_STATE_TTL_MS || String(10 * 60 * 1000), 10);
/** One stored app state together with the data needed to scope and expire it. */
type StoredState = {
/** Session that saved the state; lookups by state id must match it. */
sessionId: string;
/** Normalized (trimmed, lower-cased) app name of the state. */
appKey: string;
/** The saved state, including its final state id. */
state: SemanticAppState;
/** `Date.now()` timestamp of the save; compared against the TTL on expiry. */
updatedAt: number;
};
/**
 * Builds the lookup key for an app name.
 *
 * @param app - App name as supplied by the caller or the state.
 * @returns The name with surrounding whitespace removed, in lower case.
 */
function normalizeAppKey(app: string): string {
  const trimmed = app.trim();
  return trimmed.toLowerCase();
}
/**
 * In-memory store of semantic app states, scoped by session.
 *
 * States are addressable by state id, and the most recently saved state for
 * each (session, app) pair is tracked separately. Entries older than the
 * configured TTL are dropped whenever the store is read or written.
 */
export class SemanticSessionStore {
  private states = new Map<string, StoredState>();
  private latestBySessionApp = new Map<string, string>();

  /** Generates a new unique state id. */
  createStateId(): string {
    return `state_${randomUUID()}`;
  }

  /**
   * Stores a state for a session and marks it as the latest for its app.
   *
   * @param sessionId - Owning session.
   * @param state - State to store; a state id is generated when it has none.
   * @returns The stored state, including its final state id.
   */
  save(sessionId: string, state: SemanticAppState): SemanticAppState {
    // Expire on write as well, so a session that only saves cannot grow the store forever.
    this.expire();
    const appKey = normalizeAppKey(state.app);
    const nextState = {
      ...state,
      stateId: state.stateId || this.createStateId(),
    };
    this.states.set(nextState.stateId, {
      sessionId,
      appKey,
      state: nextState,
      updatedAt: Date.now(),
    });
    this.latestBySessionApp.set(this.latestKey(sessionId, appKey), nextState.stateId);
    return nextState;
  }

  /**
   * Looks up a state.
   *
   * @param sessionId - Session performing the lookup.
   * @param app - App name; used only when no state id is given.
   * @param stateId - Specific state to fetch; must belong to the session.
   * @returns The state, or null when it is unknown, expired, or owned by another session.
   */
  getState(sessionId: string, app: string, stateId?: string): SemanticAppState | null {
    this.expire();
    if (stateId) {
      const entry = this.states.get(stateId);
      return entry && entry.sessionId === sessionId ? entry.state : null;
    }
    const latestStateId = this.latestBySessionApp.get(this.latestKey(sessionId, normalizeAppKey(app)));
    return latestStateId ? this.states.get(latestStateId)?.state || null : null;
  }

  /**
   * Finds one element of a state by its index.
   *
   * @returns The element, or null when the state or the element is not found.
   */
  getElement(sessionId: string, app: string, elementIndex: string, stateId?: string): SemanticElement | null {
    const state = this.getState(sessionId, app, stateId);
    return state?.elements.find((element) => element.index === elementIndex) || null;
  }

  /** Removes every state that belongs to the session. */
  clearSession(sessionId: string): void {
    for (const [stateId, entry] of this.states.entries()) {
      if (entry.sessionId === sessionId) {
        this.remove(stateId, entry);
      }
    }
  }

  /**
   * Removes states older than the TTL.
   *
   * @param now - Reference time in milliseconds; defaults to the current time.
   */
  expire(now = Date.now()): void {
    const ttl = Number.isFinite(DEFAULT_STATE_TTL_MS) && DEFAULT_STATE_TTL_MS > 0
      ? DEFAULT_STATE_TTL_MS
      : 10 * 60 * 1000;
    for (const [stateId, entry] of this.states.entries()) {
      if (now - entry.updatedAt > ttl) {
        this.remove(stateId, entry);
      }
    }
  }

  /**
   * Deletes one state. The "latest" pointer for its (session, app) pair is
   * cleared only when it still points at this state; otherwise removing an
   * old state would hide a newer, still valid state of the same app.
   */
  private remove(stateId: string, entry: StoredState): void {
    this.states.delete(stateId);
    const key = this.latestKey(entry.sessionId, entry.appKey);
    if (this.latestBySessionApp.get(key) === stateId) {
      this.latestBySessionApp.delete(key);
    }
  }

  private latestKey(sessionId: string, appKey: string): string {
    return `${sessionId}:${appKey}`;
  }
}
export const semanticSessionStore = new SemanticSessionStore();

View File

@@ -0,0 +1,17 @@
/** Maps MCP tool names to the semantic operation names they trigger. */
export const semanticMcpToolMap: Record<string, string> = {
  computer_app_drag: 'drag',
  computer_click_element: 'click',
  computer_get_app_state: 'get_app_state',
  computer_list_apps: 'list_apps',
  computer_perform_secondary_action: 'perform_secondary_action',
  computer_press_key: 'press_key',
  computer_scroll_element: 'scroll',
  computer_set_value: 'set_value',
  computer_type_text: 'type_text',
};

/** Every operation name that appears as a value in `semanticMcpToolMap`. */
export const semanticOperationNames = new Set(Object.values(semanticMcpToolMap));

/**
 * Resolves the semantic operation for an MCP tool name.
 *
 * @param toolName - MCP tool name to look up.
 * @returns The operation name, or null when the tool is not in the map.
 */
export function semanticOperationForMcpTool(toolName: string): string | null {
  // Only own keys count: a plain object lookup would also find inherited members
  // such as "constructor" or "toString" and return them as if they were operations.
  if (!Object.prototype.hasOwnProperty.call(semanticMcpToolMap, toolName)) {
    return null;
  }
  return semanticMcpToolMap[toolName] || null;
}

View File

@@ -0,0 +1,58 @@
import type { DisplaySize, Point } from '@/modules/computer-use/computer-executor.js';
/** Rectangle given by its origin and size. NOTE(review): presumably screen coordinates; confirm against the helpers. */
export type SemanticBounds = {
x: number;
y: number;
width: number;
height: number;
};
/** Description of one application as returned by an app listing. */
export type SemanticApp = {
id?: string;
name: string;
// NOTE(review): presumably only set on macOS; confirm against the macOS helper.
bundleIdentifier?: string;
processName?: string;
pid?: number;
running: boolean;
windowTitle?: string;
};
/** One UI element of an app state. */
export type SemanticElement = {
/** Identifier used to address the element within its state. */
index: string;
role: string;
title?: string;
value?: string;
description?: string;
enabled?: boolean;
focused?: boolean;
selected?: boolean;
bounds?: SemanticBounds;
actions?: string[];
settableValue?: boolean;
};
/** Snapshot of an application's UI at one point in time. */
export type SemanticAppState = {
/** Identifier of this snapshot; later element-based calls refer to it. */
stateId: string;
app: string;
platform: NodeJS.Platform;
/** Screenshot as a data URL, or null when none is available. */
screenshotDataUrl: string | null;
displaySize: DisplaySize | null;
elements: SemanticElement[];
accessibilityTree: SemanticElement[];
treeText?: string;
message?: string;
};
/** Input of a semantic tool call: arbitrary extra fields plus the common addressing fields. */
export type SemanticToolInput = Record<string, unknown> & {
sessionId?: string;
app?: string;
stateId?: string;
element_index?: string;
};
/** Result of a semantic tool call: either an app state or an app listing. */
export type SemanticToolResult = SemanticAppState | {
apps: SemanticApp[];
platform: NodeJS.Platform;
};
/** Alias of the executor's `Point` type. */
export type SemanticActionPoint = Point;