Revert "chore: remove computer use"

This reverts commit 6761f31a56.
This commit is contained in:
Simos Mikelatos
2026-06-29 10:32:18 +00:00
parent 6761f31a56
commit 0053e99c51
57 changed files with 6298 additions and 14 deletions

View File

@@ -0,0 +1,67 @@
import {
captureScreenshot,
executor,
type ExecutorTarget,
} from '@/modules/computer-use/computer-executor.js';
import type { RawActionResult, RawComputerAction, RawActionTarget } from '@/modules/computer-use/actions/raw-action-types.js';
const DEFAULT_WAIT_MS = 1000;
const MAX_WAIT_MS = 10_000;
function normalizeWaitMs(ms: number | undefined): number {
if (ms === undefined) {
return DEFAULT_WAIT_MS;
}
if (!Number.isFinite(ms)) {
throw new Error('Computer Use wait duration must be a finite number.');
}
return Math.trunc(Math.max(0, Math.min(ms, MAX_WAIT_MS)));
}
async function snapshot(target: RawActionTarget): Promise<RawActionResult> {
const { dataUrl, size } = await captureScreenshot();
return { screenshotDataUrl: dataUrl, displaySize: size || target.displaySize };
}
export async function runRawComputerAction(
action: RawComputerAction,
target: RawActionTarget,
): Promise<RawActionResult> {
const executorTarget: ExecutorTarget = {
displaySize: target.displaySize,
};
switch (action.type) {
case 'screenshot':
return snapshot(target);
case 'cursor_position': {
const position = await executor.cursorPosition(executorTarget);
return { ...(await snapshot(target)), position, cursor: position };
}
case 'mouse_move':
await executor.moveTo(executorTarget, action.point);
return { ...(await snapshot(target)), cursor: action.point };
case 'click':
await executor.click(executorTarget, action.button, action.point, action.double === true);
return { ...(await snapshot(target)), cursor: action.point ?? null };
case 'drag':
await executor.drag(executorTarget, action.from, action.to, action.button ?? 'left');
return { ...(await snapshot(target)), cursor: action.to };
case 'type':
await executor.type(action.text);
return snapshot(target);
case 'key':
await executor.pressChord(action.key);
return snapshot(target);
case 'scroll':
await executor.scroll(executorTarget, action.direction, action.amount ?? 3, action.point);
return { ...(await snapshot(target)), cursor: action.point ?? null };
case 'wait':
await new Promise((resolve) => setTimeout(resolve, normalizeWaitMs(action.ms)));
return snapshot(target);
default: {
const exhaustive: never = action;
throw new Error(`Unsupported computer action: ${(exhaustive as { type?: string }).type || 'unknown'}`);
}
}
}

View File

@@ -0,0 +1,28 @@
import type {
ClickButton,
DisplaySize,
Point,
ScrollDirection,
} from '@/modules/computer-use/computer-executor.js';
export type RawComputerAction =
| { type: 'screenshot' }
| { type: 'cursor_position' }
| { type: 'mouse_move'; point: Point }
| { type: 'click'; button: ClickButton; point?: Point; double?: boolean }
| { type: 'drag'; from: Point; to: Point; button?: ClickButton }
| { type: 'type'; text: string }
| { type: 'key'; key: string }
| { type: 'scroll'; direction: ScrollDirection; amount?: number; point?: Point }
| { type: 'wait'; ms?: number };
export type RawActionTarget = {
displaySize: DisplaySize | null;
};
export type RawActionResult = {
screenshotDataUrl?: string | null;
displaySize?: DisplaySize | null;
cursor?: Point | null;
position?: Point | null;
};

View File

@@ -0,0 +1,242 @@
import { createRequire } from 'node:module';
const require = createRequire(import.meta.url);
export type Point = { x: number; y: number };
export type ClickButton = 'left' | 'right' | 'middle';
export type ScrollDirection = 'up' | 'down' | 'left' | 'right';
export type DisplaySize = { width: number; height: number };
export type RuntimeReadiness = {
nut: any | null;
screenshot: any | null;
nutInstalled: boolean;
screenshotInstalled: boolean;
};
/**
* Coordinate space the executor reports/accepts. The screenshot pixel space is
* the canonical space agents and users address; it is mapped to the nut-js
* logical mouse space before any action runs.
*/
export type ExecutorTarget = {
displaySize: DisplaySize | null;
};
export function getNut(): any | null {
try {
return require('@nut-tree-fork/nut-js');
} catch {
return null;
}
}
export function getScreenshot(): any | null {
try {
const mod = require('screenshot-desktop');
return mod?.default || mod;
} catch {
return null;
}
}
export function getRuntimeReadiness(): RuntimeReadiness {
const nut = getNut();
const screenshot = getScreenshot();
return {
nut,
screenshot,
nutInstalled: Boolean(nut),
screenshotInstalled: typeof screenshot === 'function',
};
}
/** Reads the pixel dimensions from a PNG/JPEG buffer header without decoding it. */
export function readImageSize(buffer: Buffer): DisplaySize | null {
// PNG: 8-byte signature, then IHDR chunk with width/height as big-endian uint32.
if (buffer.length >= 24 && buffer[0] === 0x89 && buffer[1] === 0x50) {
return { width: buffer.readUInt32BE(16), height: buffer.readUInt32BE(20) };
}
// JPEG: scan for a Start-Of-Frame marker (0xFFC0..0xFFCF, excluding C4/C8/CC).
if (buffer.length >= 4 && buffer[0] === 0xff && buffer[1] === 0xd8) {
let offset = 2;
while (offset + 9 < buffer.length) {
if (buffer[offset] !== 0xff) {
offset += 1;
continue;
}
const marker = buffer[offset + 1];
if (marker >= 0xc0 && marker <= 0xcf && marker !== 0xc4 && marker !== 0xc8 && marker !== 0xcc) {
return { height: buffer.readUInt16BE(offset + 5), width: buffer.readUInt16BE(offset + 7) };
}
offset += 2 + buffer.readUInt16BE(offset + 2);
}
}
return null;
}
export async function captureScreenshot(): Promise<{ dataUrl: string; size: DisplaySize | null }> {
const screenshot = getScreenshot();
if (typeof screenshot !== 'function') {
throw new Error('Computer Use runtime is not available.');
}
const buffer: Buffer = await screenshot({ format: 'png' });
return {
dataUrl: `data:image/png;base64,${buffer.toString('base64')}`,
size: readImageSize(buffer),
};
}
/** Returns the mouse coordinate space size (logical screen pixels). */
export async function getMouseSpaceSize(): Promise<DisplaySize> {
const nut = getNut();
if (!nut) {
throw new Error('Computer Use runtime is not available.');
}
const width = await nut.screen.width();
const height = await nut.screen.height();
return { width, height };
}
/** Maps a point from screenshot/image space to the mouse coordinate space. */
export async function toMouseSpace(target: ExecutorTarget, point: Point): Promise<Point> {
const mouseSize = await getMouseSpaceSize();
const image = target.displaySize || mouseSize;
const scaleX = image.width ? mouseSize.width / image.width : 1;
const scaleY = image.height ? mouseSize.height / image.height : 1;
return {
x: Math.round(point.x * scaleX),
y: Math.round(point.y * scaleY),
};
}
/** Maps a point from the mouse coordinate space back to screenshot/image space. */
export function toImageSpace(target: ExecutorTarget, point: Point, mouseSize: DisplaySize): Point {
const image = target.displaySize || mouseSize;
const scaleX = mouseSize.width ? image.width / mouseSize.width : 1;
const scaleY = mouseSize.height ? image.height / mouseSize.height : 1;
return {
x: Math.round(point.x * scaleX),
y: Math.round(point.y * scaleY),
};
}
function nutButton(nut: any, button: ClickButton) {
if (button === 'right') return nut.Button.RIGHT;
if (button === 'middle') return nut.Button.MIDDLE;
return nut.Button.LEFT;
}
/** Maps a key name (xdotool-style, as Anthropic's computer tool emits) to a nut-js Key. */
function nutKey(nut: any, token: string): any {
const map: Record<string, string> = {
return: 'Enter', enter: 'Enter', esc: 'Escape', escape: 'Escape', tab: 'Tab',
space: 'Space', backspace: 'Backspace', delete: 'Delete', del: 'Delete', insert: 'Insert',
up: 'Up', down: 'Down', left: 'Left', right: 'Right',
home: 'Home', end: 'End', pageup: 'PageUp', page_up: 'PageUp', pagedown: 'PageDown', page_down: 'PageDown',
ctrl: 'LeftControl', control: 'LeftControl', alt: 'LeftAlt', shift: 'LeftShift',
meta: 'LeftSuper', super: 'LeftSuper', cmd: 'LeftSuper', win: 'LeftSuper',
capslock: 'CapsLock',
};
const lower = token.toLowerCase();
if (map[lower]) {
return nut.Key[map[lower]];
}
if (/^f([1-9]|1[0-9]|2[0-4])$/.test(lower)) {
return nut.Key[`F${lower.slice(1)}`];
}
if (token.length === 1) {
const upper = token.toUpperCase();
if (nut.Key[upper] !== undefined) {
return nut.Key[upper];
}
if (nut.Key[`Num${token}`] !== undefined && /[0-9]/.test(token)) {
return nut.Key[`Num${token}`];
}
}
throw new Error(`Unsupported key: ${token}`);
}
/**
* The cross-platform OS executor. It is intentionally free of any server,
* database, or session dependencies so it can run both inside the local server
* process (OSS mode) and inside the standalone desktop agent (cloud relay).
*/
export const executor = {
async configure() {
const nut = getNut();
if (nut) {
// Make actions responsive; the agent loop already paces itself with screenshots.
nut.mouse.config.autoDelayMs = 2;
nut.keyboard.config.autoDelayMs = 2;
}
return nut;
},
async cursorPosition(target: ExecutorTarget): Promise<Point> {
const nut = await this.configure();
const mouseSize = await getMouseSpaceSize();
const pos = await nut.mouse.getPosition();
return toImageSpace(target, { x: pos.x, y: pos.y }, mouseSize);
},
async moveTo(target: ExecutorTarget, point: Point): Promise<void> {
const nut = await this.configure();
const dest = await toMouseSpace(target, point);
await nut.mouse.setPosition(new nut.Point(dest.x, dest.y));
},
async click(target: ExecutorTarget, button: ClickButton, point?: Point, doubleClick = false): Promise<void> {
const nut = await this.configure();
if (point) {
await this.moveTo(target, point);
}
if (doubleClick) {
await nut.mouse.doubleClick(nutButton(nut, button));
} else {
await nut.mouse.click(nutButton(nut, button));
}
},
async drag(target: ExecutorTarget, from: Point, to: Point, button: ClickButton = 'left'): Promise<void> {
const nut = await this.configure();
const start = await toMouseSpace(target, from);
const end = await toMouseSpace(target, to);
await nut.mouse.setPosition(new nut.Point(start.x, start.y));
await nut.mouse.pressButton(nutButton(nut, button));
await nut.mouse.setPosition(new nut.Point(end.x, end.y));
await nut.mouse.releaseButton(nutButton(nut, button));
},
async type(text: string): Promise<void> {
const nut = await this.configure();
await nut.keyboard.type(text);
},
async pressChord(chord: string): Promise<void> {
const nut = await this.configure();
const tokens = chord.split('+').map((token) => token.trim()).filter(Boolean);
if (tokens.length === 0) {
return;
}
const keys = tokens.map((token) => nutKey(nut, token));
for (const key of keys) {
await nut.keyboard.pressKey(key);
}
for (const key of [...keys].reverse()) {
await nut.keyboard.releaseKey(key);
}
},
async scroll(target: ExecutorTarget, direction: ScrollDirection, amount: number, point?: Point): Promise<void> {
const nut = await this.configure();
if (point) {
await this.moveTo(target, point);
}
const steps = Math.max(1, Math.round(amount));
if (direction === 'up') await nut.mouse.scrollUp(steps);
else if (direction === 'down') await nut.mouse.scrollDown(steps);
else if (direction === 'left') await nut.mouse.scrollLeft(steps);
else await nut.mouse.scrollRight(steps);
},
};

View File

@@ -0,0 +1,460 @@
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import {
captureScreenshot,
executor,
type ClickButton,
type ExecutorTarget,
type Point,
type ScrollDirection,
} from '@/modules/computer-use/computer-executor.js';
import type { SemanticAdapter } from '@/modules/computer-use/semantics/adapters/semantic-adapter.js';
import { createMacOsSemanticAdapter } from '@/modules/computer-use/semantics/adapters/macos/macos-semantic-adapter.js';
import { createWindowsSemanticAdapter } from '@/modules/computer-use/semantics/adapters/windows/windows-semantic-adapter.js';
import { resolveSemanticHelper } from '@/modules/computer-use/semantics/helpers/semantic-helper-resolver.js';
import { semanticSessionStore } from '@/modules/computer-use/semantics/semantic-session-store.js';
import type { SemanticAppState, SemanticElement } from '@/modules/computer-use/semantics/semantic-types.js';
const execFileAsync = promisify(execFile);
const MAX_APP_STATE_ELEMENTS = 250;
let helperAdapter: SemanticAdapter | null | undefined;
function readString(value: unknown): string {
return typeof value === 'string' ? value.trim() : '';
}
function requireApp(input: Record<string, unknown>): string {
const app = readString(input.app);
if (!app) {
throw new Error('app is required.');
}
return app;
}
function readNumber(value: unknown): number | undefined {
return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
}
function readButton(value: unknown): ClickButton {
return value === 'right' || value === 'middle' ? value : 'left';
}
function readClickCount(value: unknown): number {
const count = readNumber(value);
if (count === undefined) {
return 1;
}
return Math.max(1, Math.min(5, Math.trunc(count)));
}
function readDirection(value: unknown): ScrollDirection {
return value === 'up' || value === 'left' || value === 'right' ? value : 'down';
}
function readSessionId(input: Record<string, unknown>): string {
return readString(input.sessionId) || 'default';
}
function centerOf(element: SemanticElement): Point | null {
const bounds = element.bounds;
if (!bounds) {
return null;
}
return {
x: Math.round(bounds.x + bounds.width / 2),
y: Math.round(bounds.y + bounds.height / 2),
};
}
function getCachedElement(sessionId: string, app: string, index: string, stateId?: string): SemanticElement | null {
return semanticSessionStore.getElement(sessionId, app, index, stateId);
}
function getPoint(input: Record<string, unknown>, sessionId: string, app: string): Point | undefined {
const x = readNumber(input.x);
const y = readNumber(input.y);
if (x !== undefined && y !== undefined) {
return { x, y };
}
const elementIndex = readString(input.element_index);
if (!elementIndex) {
return undefined;
}
const element = getCachedElement(sessionId, app, elementIndex, readString(input.stateId) || undefined);
return element ? centerOf(element) || undefined : undefined;
}
function getHelperAdapter(): SemanticAdapter | null {
if (helperAdapter !== undefined) {
return helperAdapter;
}
if (process.platform !== 'darwin' && process.platform !== 'win32') {
helperAdapter = null;
return helperAdapter;
}
const resolution = resolveSemanticHelper();
if (!resolution.available) {
helperAdapter = null;
return helperAdapter;
}
helperAdapter = process.platform === 'darwin'
? createMacOsSemanticAdapter()
: createWindowsSemanticAdapter();
return helperAdapter;
}
function shouldFallbackFromHelper(error: unknown): boolean {
const message = error instanceof Error ? error.message : String(error);
return /not implemented|unavailable|not found|does not exist|timed out|not running|exited with code|failed to start/i.test(message);
}
async function withHelperState(
sessionId: string,
operation: (adapter: SemanticAdapter) => Promise<SemanticAppState>,
): Promise<SemanticAppState | null> {
const adapter = getHelperAdapter();
if (!adapter) {
return null;
}
try {
return semanticSessionStore.save(sessionId, await operation(adapter));
} catch (error) {
if (shouldFallbackFromHelper(error)) {
console.warn('[ComputerSemantics] Falling back from helper:', error instanceof Error ? error.message : String(error));
return null;
}
throw error;
}
}
async function run(command: string, args: string[], timeout = 5000): Promise<string> {
const { stdout } = await execFileAsync(command, args, {
timeout,
windowsHide: true,
maxBuffer: 1024 * 1024 * 4,
});
return stdout;
}
async function listMacApps(): Promise<Array<Record<string, unknown>>> {
const script = [
'tell application "System Events"',
'set appRows to {}',
'repeat with p in (application processes whose background only is false)',
'set end of appRows to (name of p as text)',
'end repeat',
'return appRows',
'end tell',
].join('\n');
const output = await run('osascript', ['-e', script]);
return output.split(', ')
.map((name) => name.trim())
.filter(Boolean)
.map((name) => ({ name, running: true }));
}
async function listWindowsApps(): Promise<Array<Record<string, unknown>>> {
const script = [
'Get-Process | Where-Object { $_.MainWindowTitle } |',
'Select-Object ProcessName, Id, MainWindowTitle | ConvertTo-Json -Depth 3',
].join(' ');
const output = await run('powershell.exe', ['-NoProfile', '-Command', script]);
const parsed = JSON.parse(output || '[]');
const rows = Array.isArray(parsed) ? parsed : [parsed];
return rows.map((row) => ({
name: row.ProcessName,
pid: row.Id,
windowTitle: row.MainWindowTitle,
running: true,
}));
}
async function listLinuxApps(): Promise<Array<Record<string, unknown>>> {
try {
const output = await run('wmctrl', ['-lx']);
return output.split(/\r?\n/)
.map((line) => line.trim())
.filter(Boolean)
.map((line) => {
const parts = line.split(/\s+/);
return {
windowId: parts[0],
desktop: parts[1],
host: parts[2],
className: parts[3],
windowTitle: parts.slice(4).join(' '),
running: true,
};
});
} catch {
const output = await run('ps', ['-eo', 'comm=']);
return [...new Set(output.split(/\r?\n/).map((name) => name.trim()).filter(Boolean))]
.slice(0, 200)
.map((name) => ({ name, running: true }));
}
}
async function listApps(): Promise<Array<Record<string, unknown>>> {
if (process.platform === 'darwin') {
return listMacApps();
}
if (process.platform === 'win32') {
return listWindowsApps();
}
return listLinuxApps();
}
async function macAccessibilityTree(app: string): Promise<SemanticElement[]> {
const escapedApp = app.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
const script = `
on safeText(v)
try
return v as text
on error
return ""
end try
end safeText
on emitElement(e, depth, maxDepth, counter)
if depth > maxDepth then return {}
set rows to {}
try
set roleText to my safeText(role of e)
on error
set roleText to "element"
end try
try
set titleText to my safeText(title of e)
on error
set titleText to ""
end try
try
set valueText to my safeText(value of e)
on error
set valueText to ""
end try
try
set posValue to position of e
set sizeValue to size of e
set boundsText to ((item 1 of posValue) as text) & "," & ((item 2 of posValue) as text) & "," & ((item 1 of sizeValue) as text) & "," & ((item 2 of sizeValue) as text)
on error
set boundsText to ""
end try
set end of rows to ((counter as text) & tab & roleText & tab & titleText & tab & valueText & tab & boundsText)
if counter > ${MAX_APP_STATE_ELEMENTS} then return rows
try
repeat with childElement in UI elements of e
set childRows to my emitElement(childElement, depth + 1, maxDepth, counter + (count of rows))
set rows to rows & childRows
if (count of rows) > ${MAX_APP_STATE_ELEMENTS} then return rows
end repeat
end try
return rows
end emitElement
tell application "System Events"
if not (exists process "${escapedApp}") then error "App is not running: ${escapedApp}"
tell process "${escapedApp}"
set rows to {}
repeat with w in windows
set rows to rows & my emitElement(w, 0, 4, (count of rows) + 1)
if (count of rows) > ${MAX_APP_STATE_ELEMENTS} then exit repeat
end repeat
return rows
end tell
end tell
`;
const output = await run('osascript', ['-e', script], 10000);
return output.split(/\r?\n|, /)
.map((line) => line.trim())
.filter(Boolean)
.map((line, index) => {
const [rawIndex, role, title, value, boundsText] = line.split('\t');
const boundsParts = (boundsText || '').split(',').map((part) => Number.parseFloat(part));
const hasBounds = boundsParts.length === 4 && boundsParts.every(Number.isFinite);
return {
index: rawIndex || String(index + 1),
role: role || 'element',
title: title || undefined,
value: value || undefined,
bounds: hasBounds
? { x: boundsParts[0], y: boundsParts[1], width: boundsParts[2], height: boundsParts[3] }
: undefined,
};
});
}
async function getAccessibilityTree(app: string): Promise<{ elements: SemanticElement[]; message?: string }> {
if (process.platform === 'darwin') {
try {
return { elements: await macAccessibilityTree(app) };
} catch (error) {
return { elements: [], message: error instanceof Error ? error.message : String(error) };
}
}
return {
elements: [],
message: 'Native accessibility tree capture is not implemented for this platform yet.',
};
}
async function getAppState(sessionId: string, app: string): Promise<SemanticAppState> {
if (!app) {
throw new Error('app is required.');
}
const helperState = await withHelperState(sessionId, (adapter) => adapter.getAppState({ sessionId, app }));
if (helperState) {
return helperState;
}
const screenshot = await captureScreenshot();
const tree = await getAccessibilityTree(app);
const state: SemanticAppState = {
stateId: semanticSessionStore.createStateId(),
app,
platform: process.platform,
screenshotDataUrl: screenshot.dataUrl,
displaySize: screenshot.size,
elements: tree.elements,
accessibilityTree: tree.elements,
message: tree.message,
};
return semanticSessionStore.save(sessionId, state);
}
async function targetFor(sessionId: string, app: string, stateId?: string): Promise<ExecutorTarget> {
const cached = semanticSessionStore.getState(sessionId, app, stateId);
return { displaySize: cached?.displaySize || (await captureScreenshot()).size };
}
export const computerSemanticsService = {
async callTool(name: string, input: Record<string, unknown>): Promise<unknown> {
const sessionId = readSessionId(input);
switch (name) {
case 'list_apps': {
const adapter = getHelperAdapter();
if (adapter) {
try {
return { apps: await adapter.listApps(), platform: process.platform };
} catch (error) {
if (!shouldFallbackFromHelper(error)) {
throw error;
}
console.warn('[ComputerSemantics] Falling back from helper:', error instanceof Error ? error.message : String(error));
}
}
return { apps: await listApps(), platform: process.platform };
}
case 'get_app_state':
return getAppState(sessionId, readString(input.app));
case 'click':
case 'click_element': {
const app = requireApp(input);
const helperState = await withHelperState(sessionId, (adapter) => adapter.clickElement({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
const stateId = readString(input.stateId) || undefined;
const point = getPoint(input, sessionId, app);
if (!point) {
throw new Error('click requires x/y or an element_index from computer_get_app_state.');
}
const target = await targetFor(sessionId, app, stateId);
const button = readButton(input.mouse_button ?? input.mouseButton);
const clickCount = readClickCount(input.click_count ?? input.clickCount);
for (let index = 0; index < clickCount; index += 1) {
await executor.click(target, button, point, false);
}
return getAppState(sessionId, app);
}
case 'drag': {
const app = requireApp(input);
const helperState = await withHelperState(sessionId, (adapter) => adapter.drag({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
const stateId = readString(input.stateId) || undefined;
const fromX = readNumber(input.from_x);
const fromY = readNumber(input.from_y);
const toX = readNumber(input.to_x);
const toY = readNumber(input.to_y);
if (fromX === undefined || fromY === undefined || toX === undefined || toY === undefined) {
throw new Error('drag requires from_x/from_y/to_x/to_y.');
}
await executor.drag(await targetFor(sessionId, app, stateId), { x: fromX, y: fromY }, { x: toX, y: toY }, readButton(input.mouse_button ?? input.mouseButton));
return getAppState(sessionId, app);
}
case 'scroll':
case 'scroll_element': {
const app = requireApp(input);
const helperState = await withHelperState(sessionId, (adapter) => adapter.scrollElement({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
const stateId = readString(input.stateId) || undefined;
const point = getPoint(input, sessionId, app);
if (!point) {
throw new Error('scroll requires x/y or an element_index from computer_get_app_state.');
}
await executor.scroll(await targetFor(sessionId, app, stateId), readDirection(input.direction), readNumber(input.pages) ?? 1, point);
return getAppState(sessionId, app);
}
case 'type_text': {
const app = requireApp(input);
const helperState = await withHelperState(sessionId, (adapter) => adapter.typeText({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
await executor.type(readString(input.text));
return getAppState(sessionId, app);
}
case 'press_key': {
const app = requireApp(input);
const helperState = await withHelperState(sessionId, (adapter) => adapter.pressKey({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
await executor.pressChord(readString(input.key));
return getAppState(sessionId, app);
}
case 'set_value': {
const app = requireApp(input);
const helperState = await withHelperState(sessionId, (adapter) => adapter.setValue({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
const stateId = readString(input.stateId) || undefined;
const point = getPoint(input, sessionId, app);
if (!point) {
throw new Error('set_value requires x/y or an element_index from computer_get_app_state.');
}
await executor.click(await targetFor(sessionId, app, stateId), 'left', point, false);
await executor.pressChord(process.platform === 'darwin' ? 'cmd+a' : 'ctrl+a');
await executor.type(readString(input.value));
return getAppState(sessionId, app);
}
case 'perform_secondary_action': {
const app = requireApp(input);
const helperState = await withHelperState(sessionId, (adapter) => adapter.performSecondaryAction({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
const stateId = readString(input.stateId) || undefined;
const point = getPoint(input, sessionId, app);
if (!point) {
throw new Error('perform_secondary_action requires x/y or an element_index from computer_get_app_state.');
}
await executor.click(await targetFor(sessionId, app, stateId), 'right', point, false);
return getAppState(sessionId, app);
}
default:
throw new Error(`Unknown semantic Computer Use tool: ${name}`);
}
},
};

View File

@@ -0,0 +1,141 @@
import express from 'express';
import { computerUseService } from '@/modules/computer-use/computer-use.service.js';
import { semanticOperationForMcpTool } from '@/modules/computer-use/semantics/semantic-tool-dispatcher.js';
const router = express.Router();
function readBearerToken(header: unknown): string | null {
if (typeof header !== 'string') {
return null;
}
const trimmed = header.trim();
const scheme = 'Bearer';
if (trimmed.slice(0, scheme.length).toLowerCase() !== scheme.toLowerCase()) {
return null;
}
const separator = trimmed[scheme.length];
if (separator !== ' ' && separator !== '\t') {
return null;
}
return trimmed.slice(scheme.length + 1).trimStart() || null;
}
function toButton(value: unknown): 'left' | 'right' | 'middle' {
return value === 'right' || value === 'middle' ? value : 'left';
}
function toScrollDirection(value: unknown): 'up' | 'down' | 'left' | 'right' {
return value === 'down' || value === 'left' || value === 'right' ? value : 'up';
}
function point(input: Record<string, unknown>): { x: number; y: number } | undefined {
return typeof input.x === 'number' && typeof input.y === 'number'
? { x: input.x, y: input.y }
: undefined;
}
function requireNumber(input: Record<string, unknown>, name: string): number {
const value = input[name];
if (typeof value !== 'number' || !Number.isFinite(value)) {
throw new Error(`${name} is required and must be a finite number.`);
}
return value;
}
function requirePoint(input: Record<string, unknown>): { x: number; y: number } {
return { x: requireNumber(input, 'x'), y: requireNumber(input, 'y') };
}
function requireNamedPoint(input: Record<string, unknown>, xName: string, yName: string): { x: number; y: number } {
return { x: requireNumber(input, xName), y: requireNumber(input, yName) };
}
router.use((req, res, next) => {
const expected = computerUseService.getMcpToken();
const token = readBearerToken(req.headers.authorization) || String(req.headers['x-computer-use-mcp-token'] || '');
if (!token || token !== expected) {
res.status(401).json({ success: false, error: 'Invalid Computer Use MCP token.' });
return;
}
next();
});
router.post('/tools/:toolName', async (req, res) => {
try {
const input = (req.body && typeof req.body === 'object' ? req.body : {}) as Record<string, unknown>;
const sessionId = typeof input.sessionId === 'string' ? input.sessionId : undefined;
const toolName = req.params.toolName;
const semanticOperation = semanticOperationForMcpTool(toolName);
let result: unknown;
if (semanticOperation) {
result = await computerUseService.callSemanticTool(semanticOperation, input);
res.json({ success: true, data: result });
return;
}
switch (toolName) {
case 'computer_screenshot':
result = await computerUseService.agentScreenshot(sessionId);
break;
case 'computer_cursor_position':
result = await computerUseService.agentCursorPosition(sessionId);
break;
case 'computer_mouse_move':
result = await computerUseService.agentMouseMove(sessionId, requirePoint(input));
break;
case 'computer_click':
result = await computerUseService.agentUnifiedClick(sessionId, {
button: toButton(input.mouseButton ?? input.mouse_button ?? input.button),
point: point(input),
clickCount: typeof input.clickCount === 'number'
? input.clickCount
: typeof input.click_count === 'number'
? input.click_count
: 1,
});
break;
case 'computer_drag': {
const from = requireNamedPoint(input, 'startX', 'startY');
const to = requireNamedPoint(input, 'endX', 'endY');
result = await computerUseService.agentDrag(sessionId, from, to, toButton(input.mouseButton ?? input.mouse_button ?? input.button));
break;
}
case 'computer_type':
result = await computerUseService.agentType(sessionId, String(input.text || ''));
break;
case 'computer_key':
result = await computerUseService.agentKey(sessionId, String(input.key || ''));
break;
case 'computer_scroll':
result = await computerUseService.agentScroll(sessionId, {
direction: toScrollDirection(input.direction),
amount: typeof input.amount === 'number' ? input.amount : undefined,
x: typeof input.x === 'number' ? input.x : undefined,
y: typeof input.y === 'number' ? input.y : undefined,
});
break;
case 'computer_wait':
result = await computerUseService.agentWait(sessionId, typeof input.timeoutMs === 'number' ? input.timeoutMs : undefined);
break;
case 'computer_close_session':
result = await computerUseService.agentStopSession(sessionId);
break;
default:
res.status(404).json({ success: false, error: `Unknown Computer Use MCP tool "${toolName}".` });
return;
}
res.json({ success: true, data: result });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Computer Use MCP tool failed.',
});
}
});
export default router;

View File

@@ -0,0 +1,211 @@
import express from 'express';
import { computerUseService } from '@/modules/computer-use/computer-use.service.js';
import { AppError } from '@/shared/utils.js';
const router = express.Router();
type AuthenticatedRequest = express.Request & {
user?: {
id?: string | number;
};
};
function requireUser(req: AuthenticatedRequest): { id: string | number } {
const userId = req.user?.id;
if (userId === undefined || userId === null || String(userId).trim() === '') {
throw new AppError('Authenticated user is required.', {
code: 'AUTHENTICATED_USER_REQUIRED',
statusCode: 401,
});
}
return { id: userId };
}
function getErrorStatusCode(error: unknown, fallbackStatusCode: number): number {
if (error instanceof AppError) {
return error.statusCode;
}
if (error && typeof error === 'object') {
const statusCode = 'statusCode' in error ? error.statusCode : 'status' in error ? error.status : undefined;
if (typeof statusCode === 'number' && Number.isInteger(statusCode) && statusCode >= 400 && statusCode <= 599) {
return statusCode;
}
}
return fallbackStatusCode;
}
function readParam(value: string | string[] | undefined): string {
return Array.isArray(value) ? value[0] || '' : value || '';
}
function toButton(value: unknown): 'left' | 'right' | 'middle' {
return value === 'right' || value === 'middle' ? value : 'left';
}
router.get('/status', async (_req, res) => {
try {
res.json({ success: true, data: await computerUseService.getStatus() });
} catch (error) {
res.status(500).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to load Computer Use status.',
});
}
});
router.get('/settings', async (req: AuthenticatedRequest, res) => {
try {
requireUser(req);
res.json({ success: true, data: { settings: await computerUseService.getSettings() } });
} catch (error) {
res.status(getErrorStatusCode(error, 500)).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to load Computer Use settings.',
});
}
});
router.put('/settings', async (req: AuthenticatedRequest, res) => {
try {
requireUser(req);
const settings = await computerUseService.updateSettings(req.body || {});
res.json({ success: true, data: { settings } });
} catch (error) {
res.status(getErrorStatusCode(error, 400)).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to save Computer Use settings.',
});
}
});
router.post('/runtime/install', async (req: AuthenticatedRequest, res) => {
try {
requireUser(req);
const result = await computerUseService.installRuntime();
res.status(result.success ? 200 : 500).json({
success: result.success,
data: result,
error: result.success ? undefined : result.message,
});
} catch (error) {
res.status(getErrorStatusCode(error, 500)).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to install Computer Use runtime.',
});
}
});
router.get('/sessions', async (req: AuthenticatedRequest, res) => {
try {
res.json({ success: true, data: { sessions: await computerUseService.listSessions(requireUser(req)) } });
} catch (error) {
res.status(getErrorStatusCode(error, 500)).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to list Computer Use sessions.',
});
}
});
router.post('/sessions/:sessionId/screenshot', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.userScreenshot(requireUser(req), readParam(req.params.sessionId));
res.json({ success: true, data: { session } });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to capture the screen.',
});
}
});
router.post('/sessions/:sessionId/click', async (req: AuthenticatedRequest, res) => {
try {
const x = Number(req.body?.x);
const y = Number(req.body?.y);
if (!Number.isFinite(x) || !Number.isFinite(y)) {
res.status(400).json({
success: false,
error: 'Valid numeric coordinates are required.',
});
return;
}
const session = await computerUseService.userClick(requireUser(req), readParam(req.params.sessionId), {
x,
y,
button: toButton(req.body?.button),
double: req.body?.double === true,
});
res.json({ success: true, data: { session } });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to click.',
});
}
});
router.post('/sessions/:sessionId/press-key', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.userPressKey(requireUser(req), readParam(req.params.sessionId), String(req.body?.key || ''));
res.json({ success: true, data: { session } });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to send key input.',
});
}
});
router.post('/sessions/:sessionId/consent/grant', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.grantAgentAccess(requireUser(req), readParam(req.params.sessionId));
res.json({ success: true, data: { session } });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to grant control.',
});
}
});
router.post('/sessions/:sessionId/consent/revoke', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.revokeAgentAccess(requireUser(req), readParam(req.params.sessionId));
res.json({ success: true, data: { session } });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to revoke control.',
});
}
});
router.post('/sessions/:sessionId/stop', async (req: AuthenticatedRequest, res) => {
try {
const result = await computerUseService.stopSession(requireUser(req), readParam(req.params.sessionId));
res.json({ success: true, data: result });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to stop Computer Use session.',
});
}
});
router.delete('/sessions/:sessionId', async (req: AuthenticatedRequest, res) => {
try {
const result = await computerUseService.deleteSession(requireUser(req), readParam(req.params.sessionId));
res.json({ success: true, data: result });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to delete Computer Use session.',
});
}
});
export default router;

View File

@@ -0,0 +1,920 @@
import { randomBytes, randomUUID } from 'node:crypto';
import { spawn } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { appConfigDb } from '@/modules/database/index.js';
import { providerMcpService } from '@/modules/providers/index.js';
import { getModuleDir } from '@/utils/runtime-paths.js';
import {
getRuntimeReadiness as getExecutorReadiness,
type Point,
type ClickButton,
type ScrollDirection,
} from '@/modules/computer-use/computer-executor.js';
import { runRawComputerAction } from '@/modules/computer-use/actions/raw-action-dispatcher.js';
import type { RawComputerAction } from '@/modules/computer-use/actions/raw-action-types.js';
import { desktopAgentRelay } from '@/modules/computer-use/desktop-agent-relay.service.js';
import { computerSemanticsService } from '@/modules/computer-use/computer-semantics.service.js';
import { semanticOperationNames } from '@/modules/computer-use/semantics/semantic-tool-dispatcher.js';
const __dirname = getModuleDir(import.meta.url);
const IS_PLATFORM = process.env.VITE_IS_PLATFORM === 'true';
const MAX_SESSIONS_PER_OWNER = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_MAX_SESSIONS_PER_OWNER || '1', 10);
const SESSION_TTL_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_SESSION_TTL_MS || String(30 * 60 * 1000), 10);
const STOPPED_SESSION_RETENTION_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_STOPPED_SESSION_RETENTION_MS || String(30 * 60 * 1000), 10);
const MAX_STORED_SESSIONS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_MAX_STORED_SESSIONS || '100', 10);
const COMPUTER_USE_SETTINGS_KEY = 'computer_use_settings';
const COMPUTER_USE_MCP_TOKEN_KEY = 'computer_use_mcp_token';
type ComputerUseRuntime = 'cloud' | 'local';
type ComputerUseSessionStatus = 'ready' | 'stopped' | 'unavailable';
type ComputerUseSession = {
id: string;
ownerId: string;
createdBy: 'user' | 'agent';
runtime: ComputerUseRuntime;
status: ComputerUseSessionStatus;
screenshotDataUrl: string | null;
createdAt: string;
updatedAt: string;
lastAction: string | null;
message: string | null;
/** Per-session consent: agents may act only while this is true. */
agentAccessEnabled: boolean;
/** Size of the captured screenshot in pixels — the coordinate space agents/users use. */
displaySize: {
width: number;
height: number;
} | null;
cursor: {
x: number;
y: number;
actor: 'agent' | 'user';
} | null;
};
type PublicComputerUseSession = Omit<ComputerUseSession, 'ownerId'>;
type ComputerUseOwner = {
id: string | number;
};
type ComputerUseSettings = {
enabled: boolean;
};
type RuntimeReadiness = {
nut: any | null;
screenshot: any | null;
nutInstalled: boolean;
screenshotInstalled: boolean;
installInProgress: boolean;
installMessage: string | null;
};
const sessions = new Map<string, ComputerUseSession>();
let installPromise: Promise<{ success: boolean; message: string }> | null = null;
let lastInstallMessage: string | null = null;
const DEFAULT_SETTINGS: ComputerUseSettings = {
enabled: false,
};
const AGENT_OWNER_ID = 'agent';
const MCP_SERVER_NAME = 'cloudcli-computer-use';
const MCP_PROVIDERS = ['claude', 'codex', 'cursor', 'gemini', 'opencode'];
function getRuntime(): ComputerUseRuntime {
return IS_PLATFORM ? 'cloud' : 'local';
}
function readSettings(): ComputerUseSettings {
try {
const raw = appConfigDb.get(COMPUTER_USE_SETTINGS_KEY);
if (!raw) {
return DEFAULT_SETTINGS;
}
const parsed = JSON.parse(raw) as Partial<ComputerUseSettings>;
return {
enabled: parsed.enabled === true,
};
} catch (error: any) {
console.warn('[Computer Use] Failed to read settings:', error?.message || error);
return DEFAULT_SETTINGS;
}
}
function writeSettings(settings: ComputerUseSettings): ComputerUseSettings {
const normalized = {
enabled: settings.enabled === true,
};
appConfigDb.set(COMPUTER_USE_SETTINGS_KEY, JSON.stringify(normalized));
return normalized;
}
function getOrCreateMcpToken(): string {
const existing = appConfigDb.get(COMPUTER_USE_MCP_TOKEN_KEY);
if (existing) {
return existing;
}
const token = randomBytes(32).toString('hex');
appConfigDb.set(COMPUTER_USE_MCP_TOKEN_KEY, token);
return token;
}
function getSetupMessage(settings: ComputerUseSettings, readiness: RuntimeReadiness): string {
if (!settings.enabled) {
return 'Computer Use is disabled in settings.';
}
if (getRuntime() === 'cloud') {
return 'Open CloudCLI Desktop on this computer, connect the same account, and enable Computer Use.';
}
if (!readiness.nutInstalled || !readiness.screenshotInstalled) {
return 'Install the desktop control runtime to capture the screen and drive the mouse and keyboard.';
}
return readiness.installMessage || 'Computer Use runtime is not ready.';
}
function getMcpCommand(): { command: string; args: string[] } {
const serverDir = path.resolve(__dirname, '..', '..');
const mcpScriptPath = path.join(serverDir, 'computer-use-mcp.js');
if (fs.existsSync(mcpScriptPath)) {
return {
command: process.execPath,
args: [mcpScriptPath],
};
}
return {
command: 'cloudcli',
args: ['computer-use-mcp'],
};
}
function getMcpApiUrl(): string {
const port = process.env.SERVER_PORT || process.env.PORT || '3001';
return `http://127.0.0.1:${port}/api/computer-use-mcp`;
}
function getRuntimeReadiness(): RuntimeReadiness {
const base = getExecutorReadiness();
return {
...base,
installInProgress: Boolean(installPromise),
installMessage: lastInstallMessage,
};
}
function runCommand(command: string, args: string[]): Promise<void> {
return new Promise((resolve, reject) => {
const child = spawn(command, args, {
cwd: process.cwd(),
env: process.env,
shell: false,
stdio: ['ignore', 'pipe', 'pipe'],
});
const output: string[] = [];
child.stdout.on('data', (chunk) => output.push(String(chunk)));
child.stderr.on('data', (chunk) => output.push(String(chunk)));
child.on('error', reject);
child.on('close', (code) => {
if (code === 0) {
resolve();
return;
}
reject(new Error(output.join('').trim() || `${command} ${args.join(' ')} exited with code ${code}`));
});
});
}
function formatInstallError(error: unknown): string {
const message = error instanceof Error ? error.message : String(error);
if (process.platform === 'linux' && /libxtst|x11|xtst|libpng|imagemagick|scrot/i.test(message)) {
return [
'Installing the desktop control runtime needs system packages.',
'On Debian/Ubuntu run: sudo apt-get install -y libxtst-dev libpng-dev imagemagick',
'then try again.',
].join(' ');
}
return message || 'Failed to install the Computer Use runtime.';
}
function isPackagedElectronNodeRuntime(): boolean {
return process.env.ELECTRON_RUN_AS_NODE === '1' && Boolean(process.versions.electron);
}
async function installRuntime(): Promise<{ success: boolean; message: string }> {
if (installPromise) {
return installPromise;
}
const readiness = getExecutorReadiness();
if (readiness.nutInstalled && readiness.screenshotInstalled) {
lastInstallMessage = 'Computer Use runtime is available.';
return { success: true, message: lastInstallMessage };
}
if (isPackagedElectronNodeRuntime()) {
lastInstallMessage = 'Computer Use runtime was not bundled with this desktop build.';
return { success: false, message: lastInstallMessage };
}
const npmCommand = process.platform === 'win32' ? 'npm.cmd' : 'npm';
installPromise = (async () => {
try {
lastInstallMessage = 'Installing desktop control runtime…';
await runCommand(npmCommand, [
'install',
'--no-save',
'--no-package-lock',
'@nut-tree-fork/nut-js',
'screenshot-desktop',
]);
lastInstallMessage = 'Computer Use runtime installed.';
return { success: true, message: lastInstallMessage };
} catch (error) {
lastInstallMessage = formatInstallError(error);
return { success: false, message: lastInstallMessage };
}
})();
try {
return await installPromise;
} finally {
installPromise = null;
}
}
function getOwnerId(owner: ComputerUseOwner): string {
if (owner.id === undefined || owner.id === null || String(owner.id).trim() === '') {
throw new Error('Authenticated user is required.');
}
return String(owner.id);
}
function publicSession(session: ComputerUseSession): PublicComputerUseSession {
const { ownerId: _ownerId, ...publicFields } = session;
return publicFields;
}
function ownerSessions(ownerId: string): ComputerUseSession[] {
return [...sessions.values()].filter((session) => session.ownerId === ownerId);
}
function canAccessSession(ownerId: string, session: ComputerUseSession): boolean {
return session.ownerId === ownerId || session.ownerId === AGENT_OWNER_ID;
}
function normalizeSessionId(sessionId?: string | null): string | null {
if (typeof sessionId !== 'string') {
return null;
}
const trimmed = sessionId.trim();
return trimmed ? trimmed : null;
}
function findActiveAgentSession(): ComputerUseSession | null {
return ownerSessions(AGENT_OWNER_ID)
.filter((session) => session.status === 'ready')
.sort((a, b) => Date.parse(b.updatedAt) - Date.parse(a.updatedAt))[0] || null;
}
function positiveDuration(value: number, fallback: number): number {
return Number.isFinite(value) && value > 0 ? value : fallback;
}
async function expireStaleSessions(now = Date.now()): Promise<void> {
const sessionTtl = positiveDuration(SESSION_TTL_MS, 30 * 60 * 1000);
const stoppedRetention = positiveDuration(STOPPED_SESSION_RETENTION_MS, sessionTtl);
for (const [sessionId, session] of sessions.entries()) {
const updatedAt = Date.parse(session.updatedAt);
if (!Number.isFinite(updatedAt)) {
continue;
}
if (session.status === 'ready') {
if (now - updatedAt <= sessionTtl) {
continue;
}
session.status = 'stopped';
session.agentAccessEnabled = false;
session.updatedAt = new Date(now).toISOString();
session.lastAction = 'expire';
session.message = 'Computer Use session expired after inactivity.';
continue;
}
if (now - updatedAt > stoppedRetention) {
sessions.delete(sessionId);
}
}
const maxStoredSessions = Number.isFinite(MAX_STORED_SESSIONS) && MAX_STORED_SESSIONS > 0
? MAX_STORED_SESSIONS
: 100;
if (sessions.size <= maxStoredSessions) {
return;
}
const removable = [...sessions.values()]
.filter((session) => session.status !== 'ready')
.sort((a, b) => Date.parse(a.updatedAt) - Date.parse(b.updatedAt));
for (const session of removable) {
if (sessions.size <= maxStoredSessions) {
break;
}
sessions.delete(session.id);
}
}
// --- Action layer: local executor (OSS) or cloud relay to the desktop agent --
//
// Every desktop interaction goes through `performAction` / `getCursorPosition`.
// In local mode it drives the in-process nut-js executor (computer-executor.ts);
// in cloud mode it forwards the action to the linked desktop agent over
// `desktopAgentRelay` and applies the returned screenshot. The local server
// itself never touches the OS in cloud mode.
/** Shape the desktop agent returns for any relayed action. */
type RelayResult = {
screenshotDataUrl?: string | null;
displaySize?: { width: number; height: number } | null;
cursor?: { x: number; y: number } | null;
position?: Point | null;
};
function applyRelayResult(session: ComputerUseSession, result: RelayResult): void {
if (typeof result.screenshotDataUrl === 'string') {
session.screenshotDataUrl = result.screenshotDataUrl;
}
if (result.displaySize) {
session.displaySize = result.displaySize;
}
if (result.cursor) {
session.cursor = { x: result.cursor.x, y: result.cursor.y, actor: session.cursor?.actor ?? 'agent' };
}
session.updatedAt = new Date().toISOString();
}
function stripSessionArgs(args: Record<string, unknown>): Record<string, unknown> {
const { sessionId: _sessionId, ...toolArgs } = args;
return toolArgs;
}
async function refreshScreenshot(session: ComputerUseSession): Promise<void> {
if (getRuntime() === 'cloud') {
const result = (await desktopAgentRelay.relay('screenshot', { sessionId: session.id })) as RelayResult;
applyRelayResult(session, result);
return;
}
applyRelayResult(session, await runRawComputerAction({ type: 'screenshot' }, session));
}
/** Runs one action and refreshes the session screenshot afterwards. */
async function performAction(session: ComputerUseSession, action: RawComputerAction): Promise<void> {
if (getRuntime() === 'cloud') {
const result = (await desktopAgentRelay.relay(action.type, {
...action,
sessionId: session.id,
displaySize: session.displaySize,
})) as RelayResult;
applyRelayResult(session, result);
return;
}
applyRelayResult(session, await runRawComputerAction(action, session));
}
/** Reads the current cursor position in screenshot-pixel space. */
async function getCursorPosition(session: ComputerUseSession): Promise<Point> {
if (getRuntime() === 'cloud') {
const result = (await desktopAgentRelay.relay('cursor_position', {
sessionId: session.id,
displaySize: session.displaySize,
})) as RelayResult;
applyRelayResult(session, result);
if (result.position) {
return result.position;
}
return session.cursor ? { x: session.cursor.x, y: session.cursor.y } : { x: 0, y: 0 };
}
const result = await runRawComputerAction({ type: 'cursor_position' }, session);
applyRelayResult(session, result);
return result.position || session.cursor || { x: 0, y: 0 };
}
function assertReady(session: ComputerUseSession): void {
if (session.status !== 'ready') {
throw new Error(session.message || 'Computer Use session is not available.');
}
}
function agentToolsAvailable(): boolean {
const settings = readSettings();
if (!settings.enabled) {
return false;
}
if (getRuntime() === 'cloud') {
return desktopAgentRelay.isConnected();
}
return true;
}
function assertAgentToolsAvailable(): void {
if (agentToolsAvailable()) {
return;
}
const settings = readSettings();
if (!settings.enabled) {
throw new Error('Computer Use agent tools are disabled.');
}
throw new Error(
getRuntime() === 'cloud'
? 'No desktop is linked. Open CloudCLI Desktop on this computer, connect the same account, and enable Computer Use.'
: 'Computer Use agent tools are disabled.'
);
}
function stopSessions(lastAction: string, message: string): void {
for (const session of sessions.values()) {
session.status = 'stopped';
session.agentAccessEnabled = false;
session.updatedAt = new Date().toISOString();
session.lastAction = lastAction;
session.message = message;
}
}
export const computerUseService = {
async getSettings() {
return readSettings();
},
async updateSettings(settings: Partial<ComputerUseSettings>) {
const current = readSettings();
const enabled = typeof settings.enabled === 'boolean' ? settings.enabled : current.enabled;
const next = writeSettings({ enabled });
if (next.enabled) {
await this.registerAgentMcp();
} else {
await this.unregisterAgentMcp();
desktopAgentRelay.disconnectAll('Computer Use was disabled in this environment.');
stopSessions('settings:disabled', 'Computer Use was disabled in settings.');
}
return next;
},
async getStatus() {
const settings = readSettings();
const readiness = getRuntimeReadiness();
const isCloud = getRuntime() === 'cloud';
const runtimeReady = readiness.nutInstalled && readiness.screenshotInstalled;
// Cloud mode still respects the saved feature setting. When enabled, cloud
// availability comes from a linked desktop agent because the hosted server
// has no screen of its own.
const desktopAgentConnected = desktopAgentRelay.isConnected();
const available = settings.enabled && (isCloud
? desktopAgentConnected
: runtimeReady);
return {
enabled: settings.enabled,
runtime: getRuntime(),
available,
desktopAgentConnected,
desktopAgentCount: desktopAgentRelay.connectedCount(),
nutInstalled: readiness.nutInstalled,
screenshotInstalled: readiness.screenshotInstalled,
installInProgress: readiness.installInProgress,
sessionCount: sessions.size,
message: available ? 'Computer Use runtime is available.' : getSetupMessage(settings, readiness),
};
},
async registerAgentMcp() {
const { command, args } = getMcpCommand();
const results = await providerMcpService.addMcpServerToAllProviders({
name: MCP_SERVER_NAME,
scope: 'user',
transport: 'stdio',
command,
args,
env: {
CLOUDCLI_COMPUTER_USE_MCP_TOKEN: getOrCreateMcpToken(),
CLOUDCLI_COMPUTER_USE_API_URL: getMcpApiUrl(),
},
});
return { name: MCP_SERVER_NAME, command, args, results };
},
getMcpToken() {
return getOrCreateMcpToken();
},
async unregisterAgentMcp() {
const results = await Promise.all(MCP_PROVIDERS.map(async (provider) => {
try {
const result = await providerMcpService.removeProviderMcpServer(provider, {
name: MCP_SERVER_NAME,
scope: 'user',
});
return { provider, removed: result.removed };
} catch (error) {
return {
provider,
removed: false,
error: error instanceof Error ? error.message : 'Unknown error',
};
}
}));
return { name: MCP_SERVER_NAME, results };
},
async installRuntime() {
const result = await installRuntime();
return {
...result,
status: await this.getStatus(),
};
},
async listSessions(owner: ComputerUseOwner) {
const ownerId = getOwnerId(owner);
await expireStaleSessions();
return [...sessions.values()]
.filter((session) => canAccessSession(ownerId, session))
.map(publicSession);
},
async createSession(owner: ComputerUseOwner, options?: { createdBy?: 'user' | 'agent' }) {
const ownerId = getOwnerId(owner);
await expireStaleSessions();
const createdBy = options?.createdBy ?? 'user';
const now = new Date().toISOString();
const session: ComputerUseSession = {
id: randomUUID(),
ownerId,
createdBy,
runtime: getRuntime(),
status: 'unavailable',
screenshotDataUrl: null,
createdAt: now,
updatedAt: now,
lastAction: 'create',
// Consent is always OFF at creation — the user must explicitly grant control,
// even for agent-initiated sessions controlling the full desktop.
agentAccessEnabled: false,
displaySize: null,
message: null,
cursor: null,
};
const activeOwnerSessions = ownerSessions(ownerId).filter((item) => item.status === 'ready');
if (activeOwnerSessions.length >= MAX_SESSIONS_PER_OWNER) {
throw new Error(`Computer Use is limited to ${MAX_SESSIONS_PER_OWNER} active session(s).`);
}
const settings = readSettings();
const readiness = getRuntimeReadiness();
const isCloud = getRuntime() === 'cloud';
const runtimeReady = readiness.nutInstalled && readiness.screenshotInstalled;
const ready = settings.enabled && (isCloud
? desktopAgentRelay.isConnected()
: runtimeReady);
if (!ready) {
session.message = getSetupMessage(settings, readiness);
sessions.set(session.id, session);
return publicSession(session);
}
// In cloud mode the linked desktop agent is the consent authority and prompts
// the user per its own consent mode, so the relay is allowed to act. In local
// mode the user must still grant control from the panel.
if (isCloud) {
session.agentAccessEnabled = true;
}
session.status = 'ready';
session.message = isCloud
? 'Computer Use session is ready on the linked desktop.'
: 'Computer Use session is ready. Grant control to let agents act.';
sessions.set(session.id, session);
try {
await refreshScreenshot(session);
} catch (error) {
session.status = 'unavailable';
session.message = error instanceof Error ? error.message : 'Failed to capture the screen.';
}
return publicSession(session);
},
async grantAgentAccess(owner: ComputerUseOwner, sessionId: string) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
if (!session || !canAccessSession(ownerId, session)) {
throw new Error('Computer Use session not found.');
}
session.agentAccessEnabled = true;
session.updatedAt = new Date().toISOString();
session.lastAction = 'consent:grant';
return publicSession(session);
},
async revokeAgentAccess(owner: ComputerUseOwner, sessionId: string) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
if (!session || !canAccessSession(ownerId, session)) {
throw new Error('Computer Use session not found.');
}
session.agentAccessEnabled = false;
session.updatedAt = new Date().toISOString();
session.lastAction = 'consent:revoke';
return publicSession(session);
},
async stopSession(owner: ComputerUseOwner, sessionId: string) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
if (!session || !canAccessSession(ownerId, session)) {
return { stopped: false };
}
session.status = 'stopped';
session.agentAccessEnabled = false;
session.updatedAt = new Date().toISOString();
session.lastAction = 'stop';
session.message = 'Computer Use session stopped. Agent control is revoked.';
if (getRuntime() === 'cloud' && desktopAgentRelay.isConnected()) {
// Best-effort: tell the desktop agent to forget this session's consent.
void desktopAgentRelay.relay('stop_session', { sessionId }).catch(() => undefined);
}
return { stopped: true, session: publicSession(session) };
},
async deleteSession(owner: ComputerUseOwner, sessionId: string) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
if (!session || !canAccessSession(ownerId, session)) {
return { deleted: false };
}
sessions.delete(sessionId);
return { deleted: true, sessionId };
},
// --- User-initiated actions (from the panel) -------------------------------
async userScreenshot(owner: ComputerUseOwner, sessionId: string) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
if (!session || !canAccessSession(ownerId, session)) {
throw new Error('Computer Use session not found.');
}
assertReady(session);
await refreshScreenshot(session);
session.lastAction = 'screenshot';
return publicSession(session);
},
async userClick(owner: ComputerUseOwner, sessionId: string, input: { x: number; y: number; button?: ClickButton; double?: boolean }) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
if (!session || !canAccessSession(ownerId, session)) {
throw new Error('Computer Use session not found.');
}
assertReady(session);
await performAction(session, {
type: 'click',
button: input.button || 'left',
point: { x: input.x, y: input.y },
double: input.double === true,
});
session.cursor = { x: input.x, y: input.y, actor: 'user' };
session.lastAction = input.double ? 'double_click' : 'click';
return publicSession(session);
},
async userPressKey(owner: ComputerUseOwner, sessionId: string, key: string) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
if (!session || !canAccessSession(ownerId, session)) {
throw new Error('Computer Use session not found.');
}
assertReady(session);
await performAction(session, { type: 'key', key });
session.lastAction = `key:${key}`;
return publicSession(session);
},
// --- Agent-initiated actions (via MCP) ------------------------------------
/**
* Resolves a session the agent is allowed to act on. In local mode this
* enforces the in-process per-session consent flag. In cloud mode the linked
* desktop agent is the consent authority (it prompts the user per its own
* consent mode), so this only requires the relay to be connected.
*/
async getOrCreateAgentSession(): Promise<ComputerUseSession> {
assertAgentToolsAvailable();
await expireStaleSessions();
const existing = findActiveAgentSession();
if (existing) {
return existing;
}
const created = await this.createSession({ id: AGENT_OWNER_ID }, { createdBy: 'agent' });
const session = sessions.get(created.id);
if (!session) {
throw new Error('Computer Use session could not be created.');
}
return session;
},
async getConsentedSession(sessionId?: string): Promise<ComputerUseSession> {
assertAgentToolsAvailable();
const normalizedSessionId = normalizeSessionId(sessionId);
const session = normalizedSessionId
? sessions.get(normalizedSessionId)
: await this.getOrCreateAgentSession();
if (!session) {
throw new Error('Computer Use session not found.');
}
if (getRuntime() !== 'cloud' && !session.agentAccessEnabled) {
throw new Error(`Computer Use session ${session.id} is awaiting user consent. Ask the user to grant control in the Computer panel.`);
}
assertReady(session);
return session;
},
async agentScreenshot(sessionId?: string) {
const session = await this.getConsentedSession(sessionId);
await refreshScreenshot(session);
session.lastAction = 'screenshot';
return publicSession(session);
},
async agentCursorPosition(sessionId?: string) {
const session = await this.getConsentedSession(sessionId);
const point = await getCursorPosition(session);
session.cursor = { ...point, actor: 'agent' };
session.lastAction = 'cursor_position';
return { session: publicSession(session), position: point };
},
async agentMouseMove(sessionId: string | undefined, point: Point) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'mouse_move', point });
session.cursor = { ...point, actor: 'agent' };
session.lastAction = 'mouse_move';
return publicSession(session);
},
async agentUnifiedClick(sessionId: string | undefined, input: { button?: ClickButton; point?: Point; clickCount?: number }) {
const session = await this.getConsentedSession(sessionId);
const button = input.button || 'left';
const clickCount = Math.max(1, Math.min(Math.trunc(input.clickCount || 1), 5));
for (let index = 0; index < clickCount; index += 1) {
await performAction(session, { type: 'click', button, point: input.point, double: false });
}
if (input.point) {
session.cursor = { ...input.point, actor: 'agent' };
}
session.lastAction = clickCount > 1 ? `${button}_click:${clickCount}` : `${button}_click`;
return publicSession(session);
},
async agentDrag(sessionId: string | undefined, from: Point, to: Point, button: ClickButton = 'left') {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'drag', from, to, button });
session.cursor = { ...to, actor: 'agent' };
session.lastAction = `${button}_drag`;
return publicSession(session);
},
async agentType(sessionId: string | undefined, text: string) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'type', text });
session.lastAction = 'type';
return publicSession(session);
},
async agentKey(sessionId: string | undefined, key: string) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'key', key });
session.lastAction = `key:${key}`;
return publicSession(session);
},
async agentScroll(sessionId: string | undefined, input: { direction: ScrollDirection; amount?: number; x?: number; y?: number }) {
const session = await this.getConsentedSession(sessionId);
const point = typeof input.x === 'number' && typeof input.y === 'number' ? { x: input.x, y: input.y } : undefined;
await performAction(session, { type: 'scroll', direction: input.direction, amount: input.amount, point });
if (point) {
session.cursor = { ...point, actor: 'agent' };
}
session.lastAction = `scroll:${input.direction}`;
return publicSession(session);
},
async agentWait(sessionId?: string, timeoutMs?: number) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'wait', ms: timeoutMs });
session.lastAction = 'wait';
return publicSession(session);
},
async agentStopSession(sessionId?: string) {
assertAgentToolsAvailable();
const normalizedSessionId = normalizeSessionId(sessionId);
if (normalizedSessionId) {
return this.stopSession({ id: AGENT_OWNER_ID }, normalizedSessionId);
}
await expireStaleSessions();
const existing = findActiveAgentSession();
if (!existing) {
return { stopped: false };
}
return this.stopSession({ id: AGENT_OWNER_ID }, existing.id);
},
async callSemanticTool(toolName: string, args: Record<string, unknown>) {
if (!semanticOperationNames.has(toolName)) {
throw new Error(`Unsupported semantic Computer Use tool: ${toolName}`);
}
const sessionId = typeof args.sessionId === 'string' ? args.sessionId : undefined;
const session = await this.getConsentedSession(normalizeSessionId(sessionId) ?? undefined);
const toolArgs = { ...stripSessionArgs(args), sessionId: session.id };
const semanticResult = getRuntime() === 'cloud'
? await desktopAgentRelay.relay('semantic_tool', {
sessionId: session.id,
displaySize: session.displaySize,
toolName,
arguments: toolArgs,
})
: await computerSemanticsService.callTool(toolName, toolArgs);
applyRelayResult(session, semanticResult as RelayResult);
session.lastAction = `semantic:${toolName}`;
return { session: publicSession(session), result: semanticResult };
},
/**
* Cloud only: when a desktop agent links to this hosted environment, expose
* the computer_* MCP tools only if the user enabled Computer Use in settings.
*/
async onDesktopAgentConnected() {
if (getRuntime() !== 'cloud') {
return;
}
if (!readSettings().enabled) {
return;
}
try {
await this.registerAgentMcp();
} catch (error) {
console.warn('[Computer Use] Failed to register MCP for linked desktop agent:', error instanceof Error ? error.message : error);
}
},
/** Cloud only: tear down sessions when the last desktop agent disconnects. */
async onDesktopAgentDisconnected() {
if (getRuntime() !== 'cloud' || desktopAgentRelay.isConnected()) {
return;
}
for (const session of sessions.values()) {
if (session.status === 'ready') {
session.status = 'stopped';
session.agentAccessEnabled = false;
session.updatedAt = new Date().toISOString();
session.lastAction = 'agent-disconnected';
session.message = 'The linked desktop agent disconnected.';
}
}
},
async stopAllSessions() {
stopSessions('shutdown', 'Computer Use session stopped during server shutdown.');
},
};
// Drive cloud MCP exposure + session teardown off desktop-agent connectivity.
desktopAgentRelay.setHooks({
canAcceptConnection: () => getRuntime() === 'cloud' && readSettings().enabled,
onFirstConnect: () => computerUseService.onDesktopAgentConnected(),
onLastDisconnect: () => computerUseService.onDesktopAgentDisconnected(),
});
process.once('beforeExit', () => {
void computerUseService.stopAllSessions();
});

View File

@@ -0,0 +1,158 @@
import { randomUUID } from 'node:crypto';
import type { WebSocket } from 'ws';
const RELAY_TIMEOUT_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_RELAY_TIMEOUT_MS || '60000', 10);
const WS_OPEN = 1;
type PendingRelay = {
resolve: (value: unknown) => void;
reject: (reason: Error) => void;
timer: ReturnType<typeof setTimeout>;
};
type ConnectedAgent = {
ws: WebSocket;
label: string;
registeredAt: string;
};
type RelayLifecycleHooks = {
canAcceptConnection?: () => boolean;
onFirstConnect?: () => void | Promise<void>;
onLastDisconnect?: () => void | Promise<void>;
};
const agents = new Map<WebSocket, ConnectedAgent>();
const pending = new Map<string, PendingRelay>();
let hooks: RelayLifecycleHooks = {};
function rejectAllPending(reason: string): void {
for (const [callId, call] of pending.entries()) {
clearTimeout(call.timer);
call.reject(new Error(reason));
pending.delete(callId);
}
}
function pickAgent(): ConnectedAgent | undefined {
for (const agent of agents.values()) {
if (agent.ws.readyState === WS_OPEN) {
return agent;
}
}
return undefined;
}
/**
* Cloud-side registry of linked desktop agents and the request/response relay
* used to drive the user's real desktop. The hosted server never touches the OS
* itself — it only forwards `computer_*` actions to a connected desktop agent
* and awaits the screenshot it returns.
*/
export const desktopAgentRelay = {
setHooks(next: RelayLifecycleHooks): void {
hooks = next;
},
register(ws: WebSocket, label = 'desktop-agent'): boolean {
if (hooks.canAcceptConnection && !hooks.canAcceptConnection()) {
console.log(`[DesktopAgent] Rejected (${label}); Computer Use is disabled.`);
try {
ws.close(1008, 'Computer Use is disabled in this environment.');
} catch {
// ignore close failures
}
return false;
}
const wasEmpty = pickAgent() === undefined;
agents.set(ws, { ws, label, registeredAt: new Date().toISOString() });
console.log(`[DesktopAgent] Registered (${label}); ${agents.size} connected.`);
ws.on('close', () => {
const wasRegistered = agents.delete(ws);
console.log(`[DesktopAgent] Disconnected (${label}); ${agents.size} remain.`);
if (wasRegistered && pickAgent() === undefined) {
rejectAllPending('Desktop agent disconnected.');
void hooks.onLastDisconnect?.();
}
});
if (wasEmpty) {
void hooks.onFirstConnect?.();
}
return true;
},
disconnectAll(reason = 'Desktop agent disconnected.'): void {
const hadAgent = pickAgent() !== undefined;
const sockets = [...agents.keys()];
agents.clear();
for (const ws of sockets) {
try {
ws.close(1008, reason);
} catch {
// ignore close failures
}
}
rejectAllPending(reason);
if (hadAgent) {
void hooks.onLastDisconnect?.();
}
},
/** Resolves a pending relay call with the desktop agent's reply. */
handleResult(id: string, result: unknown, error?: string): void {
const call = pending.get(id);
if (!call) {
return;
}
clearTimeout(call.timer);
pending.delete(id);
if (error) {
call.reject(new Error(error));
} else {
call.resolve(result);
}
},
isConnected(): boolean {
return pickAgent() !== undefined;
},
connectedCount(): number {
let count = 0;
for (const agent of agents.values()) {
if (agent.ws.readyState === WS_OPEN) {
count++;
}
}
return count;
},
async relay(type: string, params: Record<string, unknown>): Promise<unknown> {
const agent = pickAgent();
if (!agent) {
throw new Error(
'No desktop is linked. Open CloudCLI Desktop on this computer, connect the same account, and enable Computer Use.'
);
}
const id = randomUUID();
return new Promise<unknown>((resolve, reject) => {
const timer = setTimeout(() => {
pending.delete(id);
reject(new Error('Desktop agent did not respond in time.'));
}, RELAY_TIMEOUT_MS);
pending.set(id, { resolve, reject, timer });
try {
agent.ws.send(JSON.stringify({ kind: 'computer_relay', id, type, params }));
} catch (error) {
clearTimeout(timer);
pending.delete(id);
reject(error instanceof Error ? error : new Error('Failed to send to desktop agent.'));
}
});
},
};

View File

@@ -0,0 +1,2 @@
export { computerUseService } from '@/modules/computer-use/computer-use.service.js';
export { desktopAgentRelay } from '@/modules/computer-use/desktop-agent-relay.service.js';

View File

@@ -0,0 +1,82 @@
import { SemanticHelperProcess } from '@/modules/computer-use/semantics/helpers/semantic-helper-process.js';
import { resolveSemanticHelper } from '@/modules/computer-use/semantics/helpers/semantic-helper-resolver.js';
import type { SemanticAdapter, SemanticAdapterCapabilities } from '@/modules/computer-use/semantics/adapters/semantic-adapter.js';
import type { SemanticApp, SemanticAppState, SemanticToolInput } from '@/modules/computer-use/semantics/semantic-types.js';
type HelperMethod =
| 'list_apps'
| 'get_app_state'
| 'click_element'
| 'perform_secondary_action'
| 'set_value'
| 'type_text'
| 'press_key'
| 'scroll_element'
| 'drag';
export class HelperSemanticAdapter implements SemanticAdapter {
private helper: SemanticHelperProcess | null = null;
constructor(
private readonly platform: NodeJS.Platform,
private readonly arch: NodeJS.Architecture = process.arch,
) {}
capabilities(): SemanticAdapterCapabilities {
return {
platform: this.platform,
appDiscovery: true,
accessibilityTree: true,
nativeElementActions: true,
nativeValueSetting: true,
targetedInput: true,
};
}
async listApps(): Promise<SemanticApp[]> {
return await this.request('list_apps', {}) as SemanticApp[];
}
async getAppState(input: SemanticToolInput): Promise<SemanticAppState> {
return await this.request('get_app_state', input) as SemanticAppState;
}
async clickElement(input: SemanticToolInput): Promise<SemanticAppState> {
return await this.request('click_element', input) as SemanticAppState;
}
async performSecondaryAction(input: SemanticToolInput): Promise<SemanticAppState> {
return await this.request('perform_secondary_action', input) as SemanticAppState;
}
async setValue(input: SemanticToolInput): Promise<SemanticAppState> {
return await this.request('set_value', input) as SemanticAppState;
}
async typeText(input: SemanticToolInput): Promise<SemanticAppState> {
return await this.request('type_text', input) as SemanticAppState;
}
async pressKey(input: SemanticToolInput): Promise<SemanticAppState> {
return await this.request('press_key', input) as SemanticAppState;
}
async scrollElement(input: SemanticToolInput): Promise<SemanticAppState> {
return await this.request('scroll_element', input) as SemanticAppState;
}
async drag(input: SemanticToolInput): Promise<SemanticAppState> {
return await this.request('drag', input) as SemanticAppState;
}
private async request(method: HelperMethod, params: Record<string, unknown>): Promise<unknown> {
if (!this.helper) {
const resolution = resolveSemanticHelper(this.platform, this.arch);
if (!resolution.available || !resolution.path) {
throw new Error(resolution.reason || `Semantic helper is unavailable for ${this.platform}-${this.arch}.`);
}
this.helper = new SemanticHelperProcess(resolution.path);
}
return this.helper.request(method, params);
}
}

View File

@@ -0,0 +1,5 @@
import { HelperSemanticAdapter } from '@/modules/computer-use/semantics/adapters/helper-semantic-adapter.js';
export function createMacOsSemanticAdapter(): HelperSemanticAdapter {
return new HelperSemanticAdapter('darwin');
}

View File

@@ -0,0 +1,23 @@
import type { SemanticApp, SemanticAppState, SemanticToolInput } from '@/modules/computer-use/semantics/semantic-types.js';
export type SemanticAdapterCapabilities = {
platform: NodeJS.Platform;
appDiscovery: boolean;
accessibilityTree: boolean;
nativeElementActions: boolean;
nativeValueSetting: boolean;
targetedInput: boolean;
};
export type SemanticAdapter = {
capabilities(): SemanticAdapterCapabilities;
listApps(): Promise<SemanticApp[]>;
getAppState(input: SemanticToolInput): Promise<SemanticAppState>;
clickElement(input: SemanticToolInput): Promise<SemanticAppState>;
performSecondaryAction(input: SemanticToolInput): Promise<SemanticAppState>;
setValue(input: SemanticToolInput): Promise<SemanticAppState>;
typeText(input: SemanticToolInput): Promise<SemanticAppState>;
pressKey(input: SemanticToolInput): Promise<SemanticAppState>;
scrollElement(input: SemanticToolInput): Promise<SemanticAppState>;
drag(input: SemanticToolInput): Promise<SemanticAppState>;
};

View File

@@ -0,0 +1,5 @@
import { HelperSemanticAdapter } from '@/modules/computer-use/semantics/adapters/helper-semantic-adapter.js';
export function createWindowsSemanticAdapter(): HelperSemanticAdapter {
return new HelperSemanticAdapter('win32');
}

View File

@@ -0,0 +1,467 @@
import AppKit
import ApplicationServices
import Foundation
typealias JSON = [String: Any]
struct ElementRecord {
let index: String
let role: String
let title: String?
let value: String?
let bounds: [String: Double]?
let actions: [String]
}
var stateElements: [String: [ElementRecord]] = [:]
var stateAxElements: [String: [String: AXUIElement]] = [:]
var stateOrder: [String] = []
let maxStoredStates = 100
func jsonLine(_ object: Any) {
guard JSONSerialization.isValidJSONObject(object),
let data = try? JSONSerialization.data(withJSONObject: object),
let text = String(data: data, encoding: .utf8)
else {
print("{\"error\":\"Failed to encode JSON\"}")
fflush(stdout)
return
}
print(text)
fflush(stdout)
}
func respond(id: Any?, result: Any) {
jsonLine(["id": id ?? NSNull(), "result": result])
}
func respondError(id: Any?, _ message: String) {
jsonLine(["id": id ?? NSNull(), "error": message])
}
func stringAttr(_ element: AXUIElement, _ attr: CFString) -> String? {
var value: CFTypeRef?
guard AXUIElementCopyAttributeValue(element, attr, &value) == .success else { return nil }
return value as? String
}
func boolAttr(_ element: AXUIElement, _ attr: CFString) -> Bool? {
var value: CFTypeRef?
guard AXUIElementCopyAttributeValue(element, attr, &value) == .success else { return nil }
return value as? Bool
}
func arrayAttr(_ element: AXUIElement, _ attr: CFString) -> [AXUIElement] {
var value: CFTypeRef?
guard AXUIElementCopyAttributeValue(element, attr, &value) == .success else { return [] }
return value as? [AXUIElement] ?? []
}
func actions(_ element: AXUIElement) -> [String] {
var names: CFArray?
guard AXUIElementCopyActionNames(element, &names) == .success else { return [] }
return names as? [String] ?? []
}
func bounds(_ element: AXUIElement) -> [String: Double]? {
var positionRef: CFTypeRef?
var sizeRef: CFTypeRef?
guard AXUIElementCopyAttributeValue(element, kAXPositionAttribute as CFString, &positionRef) == .success,
AXUIElementCopyAttributeValue(element, kAXSizeAttribute as CFString, &sizeRef) == .success,
let positionValue = positionRef,
let sizeValue = sizeRef
else { return nil }
var point = CGPoint.zero
var size = CGSize.zero
guard CFGetTypeID(positionValue) == AXValueGetTypeID(),
CFGetTypeID(sizeValue) == AXValueGetTypeID()
else { return nil }
let positionAxValue = positionValue as! AXValue
let sizeAxValue = sizeValue as! AXValue
guard AXValueGetValue(positionAxValue, .cgPoint, &point),
AXValueGetValue(sizeAxValue, .cgSize, &size)
else { return nil }
return [
"x": Double(point.x),
"y": Double(point.y),
"width": Double(size.width),
"height": Double(size.height),
]
}
func record(_ element: AXUIElement, index: String) -> ElementRecord {
ElementRecord(
index: index,
role: stringAttr(element, kAXRoleAttribute as CFString) ?? "AXUnknown",
title: stringAttr(element, kAXTitleAttribute as CFString) ?? stringAttr(element, kAXDescriptionAttribute as CFString),
value: stringAttr(element, kAXValueAttribute as CFString),
bounds: bounds(element),
actions: actions(element)
)
}
func cachedElement(_ params: JSON) -> AXUIElement? {
guard let stateId = params["stateId"] as? String,
let elementIndex = params["element_index"] as? String
else {
return nil
}
return stateAxElements[stateId]?[elementIndex]
}
func dictionary(_ record: ElementRecord) -> JSON {
var output: JSON = [
"index": record.index,
"role": record.role,
"actions": record.actions,
]
if let title = record.title { output["title"] = title }
if let value = record.value { output["value"] = value }
if let bounds = record.bounds { output["bounds"] = bounds }
return output
}
func pruneStoredStates() {
while stateOrder.count > maxStoredStates {
let evicted = stateOrder.removeFirst()
stateElements.removeValue(forKey: evicted)
stateAxElements.removeValue(forKey: evicted)
}
}
func resolveApp(_ query: String) throws -> NSRunningApplication {
let normalized = query.lowercased()
let apps = NSWorkspace.shared.runningApplications.filter { app in
app.activationPolicy == .regular
}
if let app = apps.first(where: { $0.bundleIdentifier?.lowercased() == normalized }) {
return app
}
if let app = apps.first(where: { ($0.localizedName ?? "").lowercased() == normalized }) {
return app
}
if let app = apps.first(where: { ($0.localizedName ?? "").lowercased().contains(normalized) }) {
return app
}
throw NSError(domain: "CloudCLISemantics", code: 404, userInfo: [NSLocalizedDescriptionKey: "App is not running: \(query)"])
}
func listApps() -> [[String: Any]] {
NSWorkspace.shared.runningApplications
.filter { $0.activationPolicy == .regular }
.map { app in
[
"id": app.bundleIdentifier ?? app.localizedName ?? "\(app.processIdentifier)",
"name": app.localizedName ?? app.bundleIdentifier ?? "Unknown",
"bundleIdentifier": app.bundleIdentifier ?? "",
"pid": Int(app.processIdentifier),
"running": true,
]
}
}
func walk(_ element: AXUIElement, depth: Int, maxDepth: Int, records: inout [ElementRecord], axRecords: inout [String: AXUIElement], limit: Int) {
if depth > maxDepth || records.count >= limit { return }
let index = "\(records.count + 1)"
records.append(record(element, index: index))
axRecords[index] = element
for child in arrayAttr(element, kAXChildrenAttribute as CFString) {
walk(child, depth: depth + 1, maxDepth: maxDepth, records: &records, axRecords: &axRecords, limit: limit)
if records.count >= limit { return }
}
}
func pngDataUrlForMainDisplay() -> String? {
let fileURL = URL(fileURLWithPath: NSTemporaryDirectory()).appendingPathComponent("cloudcli-semantics-\(UUID().uuidString).png")
let process = Process()
process.executableURL = URL(fileURLWithPath: "/usr/sbin/screencapture")
process.arguments = ["-x", "-t", "png", fileURL.path]
do {
try process.run()
process.waitUntilExit()
guard process.terminationStatus == 0 else { return nil }
let png = try Data(contentsOf: fileURL)
try? FileManager.default.removeItem(at: fileURL)
return png.isEmpty ? nil : "data:image/png;base64,\(png.base64EncodedString())"
} catch {
try? FileManager.default.removeItem(at: fileURL)
return nil
}
}
func getAppState(_ params: JSON) throws -> JSON {
let appName = params["app"] as? String ?? ""
let app = try resolveApp(appName)
let axApp = AXUIElementCreateApplication(app.processIdentifier)
let windows = arrayAttr(axApp, kAXWindowsAttribute as CFString)
let root = windows.first ?? axApp
var records: [ElementRecord] = []
var axRecords: [String: AXUIElement] = [:]
walk(root, depth: 0, maxDepth: 5, records: &records, axRecords: &axRecords, limit: 300)
let stateId = "state_\(UUID().uuidString)"
stateElements[stateId] = records
stateAxElements[stateId] = axRecords
stateOrder.append(stateId)
pruneStoredStates()
let elements = records.map(dictionary)
return [
"stateId": stateId,
"app": app.localizedName ?? app.bundleIdentifier ?? appName,
"platform": "darwin",
"screenshotDataUrl": pngDataUrlForMainDisplay() ?? NSNull(),
"displaySize": [
"width": Int(CGDisplayPixelsWide(CGMainDisplayID())),
"height": Int(CGDisplayPixelsHigh(CGMainDisplayID())),
],
"elements": elements,
"accessibilityTree": elements,
"treeText": elements.map { "\($0["index"] ?? "") \($0["role"] ?? "") \($0["title"] ?? "")" }.joined(separator: "\n"),
]
}
func cgMouseButton(_ value: Any?) -> CGMouseButton {
guard let button = value as? String else { return .left }
switch button {
case "right": return .right
case "middle": return .center
default: return .left
}
}
func mouseEventTypes(_ button: CGMouseButton) -> (CGEventType, CGEventType) {
switch button {
case .right: return (.rightMouseDown, .rightMouseUp)
case .center: return (.otherMouseDown, .otherMouseUp)
default: return (.leftMouseDown, .leftMouseUp)
}
}
func postMouseClick(point: CGPoint, button: CGMouseButton, clickCount: Int = 1) throws {
guard let source = CGEventSource(stateID: .combinedSessionState) else {
throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create CGEventSource"])
}
let eventTypes = mouseEventTypes(button)
for _ in 0..<max(1, clickCount) {
let down = CGEvent(mouseEventSource: source, mouseType: eventTypes.0, mouseCursorPosition: point, mouseButton: button)
let up = CGEvent(mouseEventSource: source, mouseType: eventTypes.1, mouseCursorPosition: point, mouseButton: button)
down?.post(tap: .cghidEventTap)
up?.post(tap: .cghidEventTap)
usleep(80_000)
}
}
func postDrag(from: CGPoint, to: CGPoint, button: CGMouseButton) throws {
guard let source = CGEventSource(stateID: .combinedSessionState) else {
throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create CGEventSource"])
}
let eventTypes = mouseEventTypes(button)
CGEvent(mouseEventSource: source, mouseType: eventTypes.0, mouseCursorPosition: from, mouseButton: button)?.post(tap: .cghidEventTap)
usleep(80_000)
CGEvent(mouseEventSource: source, mouseType: .leftMouseDragged, mouseCursorPosition: to, mouseButton: button)?.post(tap: .cghidEventTap)
usleep(80_000)
CGEvent(mouseEventSource: source, mouseType: eventTypes.1, mouseCursorPosition: to, mouseButton: button)?.post(tap: .cghidEventTap)
}
func runAppleScript(_ script: String) throws {
let process = Process()
process.executableURL = URL(fileURLWithPath: "/usr/bin/osascript")
process.arguments = ["-e", script]
process.standardOutput = Pipe()
let stderr = Pipe()
process.standardError = stderr
try process.run()
process.waitUntilExit()
if process.terminationStatus != 0 {
let data = stderr.fileHandleForReading.readDataToEndOfFile()
let message = String(data: data, encoding: .utf8) ?? "AppleScript failed."
throw NSError(domain: "CloudCLISemantics", code: Int(process.terminationStatus), userInfo: [NSLocalizedDescriptionKey: message])
}
}
func escapedAppleScriptString(_ value: String) -> String {
value.replacingOccurrences(of: "\\", with: "\\\\").replacingOccurrences(of: "\"", with: "\\\"")
}
func pointForElement(_ params: JSON) -> CGPoint? {
if let x = params["x"] as? Double, let y = params["y"] as? Double {
return CGPoint(x: x, y: y)
}
guard let stateId = params["stateId"] as? String,
let elementIndex = params["element_index"] as? String,
let element = stateElements[stateId]?.first(where: { $0.index == elementIndex }),
let b = element.bounds,
let x = b["x"], let y = b["y"], let width = b["width"], let height = b["height"]
else {
return nil
}
return CGPoint(x: x + width / 2, y: y + height / 2)
}
func click(_ params: JSON) throws -> JSON {
if let element = cachedElement(params),
cgMouseButton(params["mouse_button"]) == .left,
(params["click_count"] as? Int ?? 1) == 1,
actions(element).contains(kAXPressAction as String),
AXUIElementPerformAction(element, kAXPressAction as CFString) == .success {
return try getAppState(params)
}
guard let point = pointForElement(params) else {
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "click_element requires x/y or stateId + element_index"])
}
let clickCount = params["click_count"] as? Int ?? 1
try postMouseClick(point: point, button: cgMouseButton(params["mouse_button"]), clickCount: clickCount)
return try getAppState(params)
}
func performSecondaryAction(_ params: JSON) throws -> JSON {
if let element = cachedElement(params),
actions(element).contains(kAXShowMenuAction as String),
AXUIElementPerformAction(element, kAXShowMenuAction as CFString) == .success {
return try getAppState(params)
}
guard let point = pointForElement(params) else {
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "perform_secondary_action requires x/y or stateId + element_index"])
}
try postMouseClick(point: point, button: .right)
return try getAppState(params)
}
func setValue(_ params: JSON) throws -> JSON {
guard let value = params["value"] as? String else {
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "set_value requires value"])
}
if let element = cachedElement(params),
AXUIElementSetAttributeValue(element, kAXValueAttribute as CFString, value as CFTypeRef) == .success {
return try getAppState(params)
}
guard let point = pointForElement(params) else {
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "set_value requires x/y or stateId + element_index"])
}
try postMouseClick(point: point, button: .left)
try runAppleScript("tell application \"System Events\" to keystroke \"a\" using command down")
try runAppleScript("tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(value))\"")
return try getAppState(params)
}
func typeText(_ params: JSON) throws -> JSON {
let text = params["text"] as? String ?? ""
try runAppleScript("tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(text))\"")
return try getAppState(params)
}
func appleScriptModifiers(_ parts: [String]) -> String {
let modifiers = parts.compactMap { part -> String? in
switch part.lowercased() {
case "cmd", "command", "meta": return "command down"
case "ctrl", "control": return "control down"
case "alt", "option": return "option down"
case "shift": return "shift down"
default: return nil
}
}
return modifiers.isEmpty ? "" : " using {\(modifiers.joined(separator: ", "))}"
}
func appleScriptKeyCode(_ key: String) -> Int? {
switch key.lowercased() {
case "return", "enter": return 36
case "tab": return 48
case "space": return 49
case "delete", "backspace": return 51
case "escape", "esc": return 53
case "left": return 123
case "right": return 124
case "down": return 125
case "up": return 126
default: return nil
}
}
func pressKey(_ params: JSON) throws -> JSON {
let raw = params["key"] as? String ?? ""
let parts = raw.split(separator: "+").map { String($0).trimmingCharacters(in: .whitespacesAndNewlines) }.filter { !$0.isEmpty }
let key = parts.last ?? raw
let modifiers = appleScriptModifiers(Array(parts.dropLast()))
if let keyCode = appleScriptKeyCode(key) {
try runAppleScript("tell application \"System Events\" to key code \(keyCode)\(modifiers)")
} else {
try runAppleScript("tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(key))\"\(modifiers)")
}
return try getAppState(params)
}
func scrollElement(_ params: JSON) throws -> JSON {
guard let point = pointForElement(params) else {
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "scroll_element requires x/y or stateId + element_index"])
}
CGWarpMouseCursorPosition(point)
let direction = params["direction"] as? String ?? "down"
let pages = params["pages"] as? Double ?? 1.0
let amount = Int32(max(1.0, abs(pages) * 8.0))
let vertical = direction == "up" ? amount : direction == "down" ? -amount : 0
let horizontal = direction == "left" ? amount : direction == "right" ? -amount : 0
CGEvent(scrollWheelEvent2Source: nil, units: .line, wheelCount: 2, wheel1: vertical, wheel2: horizontal, wheel3: 0)?.post(tap: .cghidEventTap)
return try getAppState(params)
}
func drag(_ params: JSON) throws -> JSON {
guard let fromX = params["from_x"] as? Double,
let fromY = params["from_y"] as? Double,
let toX = params["to_x"] as? Double,
let toY = params["to_y"] as? Double
else {
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "drag requires from_x/from_y/to_x/to_y"])
}
try postDrag(from: CGPoint(x: fromX, y: fromY), to: CGPoint(x: toX, y: toY), button: cgMouseButton(params["mouse_button"]))
return try getAppState(params)
}
func handle(_ request: JSON) {
let id = request["id"]
let method = request["method"] as? String ?? ""
let params = request["params"] as? JSON ?? [:]
do {
switch method {
case "list_apps":
respond(id: id, result: listApps())
case "get_app_state":
respond(id: id, result: try getAppState(params))
case "click_element":
respond(id: id, result: try click(params))
case "perform_secondary_action":
respond(id: id, result: try performSecondaryAction(params))
case "set_value":
respond(id: id, result: try setValue(params))
case "type_text":
respond(id: id, result: try typeText(params))
case "press_key":
respond(id: id, result: try pressKey(params))
case "scroll_element":
respond(id: id, result: try scrollElement(params))
case "drag":
respond(id: id, result: try drag(params))
default:
respondError(id: id, "Method is not implemented yet: \(method)")
}
} catch {
respondError(id: id, error.localizedDescription)
}
}
while let line = readLine() {
guard let data = line.data(using: .utf8),
let object = try? JSONSerialization.jsonObject(with: data),
let request = object as? JSON
else {
respondError(id: nil, "Invalid JSON request")
continue
}
handle(request)
}

View File

@@ -0,0 +1,124 @@
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import readline from 'node:readline';
type JsonRecord = Record<string, unknown>;
type PendingRequest = {
resolve: (value: unknown) => void;
reject: (error: Error) => void;
timer: ReturnType<typeof setTimeout>;
};
const DEFAULT_TIMEOUT_MS = Number.parseInt(process.env.CLOUDCLI_SEMANTICS_HELPER_TIMEOUT_MS || '60000', 10);
function timeoutMs(): number {
return Number.isFinite(DEFAULT_TIMEOUT_MS) && DEFAULT_TIMEOUT_MS > 0 ? DEFAULT_TIMEOUT_MS : 60000;
}
function errorMessage(error: unknown): string {
return error instanceof Error ? error.message : String(error);
}
export class SemanticHelperProcess {
private child: ChildProcessWithoutNullStreams | null = null;
private reader: readline.Interface | null = null;
private nextId = 1;
private pending = new Map<number, PendingRequest>();
constructor(private readonly executablePath: string) {}
async request(method: string, params: JsonRecord): Promise<unknown> {
this.ensureStarted();
const child = this.child;
if (!child?.stdin.writable) {
throw new Error('Semantic helper process is not running.');
}
const id = this.nextId++;
return new Promise((resolve, reject) => {
const timer = setTimeout(() => {
this.pending.delete(id);
reject(new Error(`Semantic helper request timed out: ${method}`));
}, timeoutMs());
this.pending.set(id, { resolve, reject, timer });
child.stdin.write(`${JSON.stringify({ id, method, params })}\n`);
});
}
stop(): void {
const child = this.child;
this.child = null;
this.reader?.close();
this.reader = null;
this.rejectAll('Semantic helper stopped.');
if (child) {
try { child.kill('SIGTERM'); } catch { /* noop */ }
}
}
private ensureStarted(): void {
if (this.child) {
return;
}
this.child = spawn(this.executablePath, [], {
stdio: ['pipe', 'pipe', 'pipe'],
windowsHide: true,
});
this.reader = readline.createInterface({ input: this.child.stdout });
this.reader.on('line', (line) => this.handleLine(line));
this.child.stderr.on('data', (chunk) => {
const text = String(chunk).trim();
if (text) {
console.error('[SemanticHelper]', text);
}
});
this.child.once('error', (error) => {
this.child = null;
this.rejectAll(`Failed to start semantic helper: ${error.message}`);
});
this.child.once('exit', (code) => {
this.child = null;
this.rejectAll(`Semantic helper exited with code ${code ?? 'null'}.`);
});
}
private handleLine(line: string): void {
let message: JsonRecord;
try {
message = JSON.parse(line) as JsonRecord;
} catch (error) {
console.error('[SemanticHelper] Invalid JSON response:', errorMessage(error));
return;
}
const id = typeof message.id === 'number' ? message.id : null;
if (id === null) {
return;
}
const pending = this.pending.get(id);
if (!pending) {
return;
}
clearTimeout(pending.timer);
this.pending.delete(id);
if (message.error) {
pending.reject(new Error(typeof message.error === 'string' ? message.error : 'Semantic helper request failed.'));
return;
}
pending.resolve(message.result);
}
private rejectAll(reason: string): void {
for (const [id, request] of this.pending.entries()) {
clearTimeout(request.timer);
request.reject(new Error(reason));
this.pending.delete(id);
}
}
}

View File

@@ -0,0 +1,97 @@
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
export type SemanticHelperPlatform = 'darwin' | 'win32';
export type SemanticHelperResolution = {
available: boolean;
path: string | null;
source: 'bundled' | 'dev' | 'missing';
platform: NodeJS.Platform;
arch: NodeJS.Architecture;
reason?: string;
};
function helperExecutableName(platform: NodeJS.Platform): string | null {
if (platform === 'darwin') {
return 'CloudCLISemantics';
}
if (platform === 'win32') {
return 'CloudCLISemantics.exe';
}
return null;
}
function pathExists(filePath: string): boolean {
try {
fs.accessSync(filePath, fs.constants.X_OK);
return true;
} catch {
try {
fs.accessSync(filePath, fs.constants.F_OK);
return true;
} catch {
return false;
}
}
}
function candidatePaths(platform: NodeJS.Platform, arch: NodeJS.Architecture): Array<{ source: 'bundled' | 'dev'; path: string }> {
const executable = helperExecutableName(platform);
if (!executable) {
return [];
}
const platformArch = `${platform}-${arch}`;
return [
{
source: 'bundled',
path: path.resolve(__dirname, '..', 'bin', platformArch, executable),
},
{
source: 'dev',
path: path.resolve(process.cwd(), 'server', 'modules', 'computer-use', 'semantics', 'bin', platformArch, executable),
},
];
}
export function resolveSemanticHelper(
platform: NodeJS.Platform = process.platform,
arch: NodeJS.Architecture = process.arch,
): SemanticHelperResolution {
const executable = helperExecutableName(platform);
if (!executable) {
return {
available: false,
path: null,
source: 'missing',
platform,
arch,
reason: `Semantic Computer Use helper is not supported on ${platform}.`,
};
}
for (const candidate of candidatePaths(platform, arch)) {
if (pathExists(candidate.path)) {
return {
available: true,
path: candidate.path,
source: candidate.source,
platform,
arch,
};
}
}
return {
available: false,
path: null,
source: 'missing',
platform,
arch,
reason: `Bundled semantic helper was not found for ${platform}-${arch} (${executable}).`,
};
}

View File

@@ -0,0 +1,11 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0-windows</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<UseWindowsForms>true</UseWindowsForms>
<UseWPF>true</UseWPF>
<AssemblyName>CloudCLISemantics</AssemblyName>
</PropertyGroup>
</Project>

View File

@@ -0,0 +1,534 @@
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Runtime.InteropServices;
using System.Text.Json;
using System.Windows.Automation;
static class Program
{
private const int MaxStoredStates = 100;
private static readonly Dictionary<string, List<ElementRecord>> StateElements = new();
private static readonly Dictionary<string, Dictionary<string, AutomationElement>> StateAutomationElements = new();
private static readonly Queue<string> StateOrder = new();
public static void Main()
{
string? line;
while ((line = Console.ReadLine()) != null)
{
try
{
using var doc = JsonDocument.Parse(line);
var root = doc.RootElement;
var id = root.TryGetProperty("id", out var idValue) ? idValue.Clone() : default;
var method = root.TryGetProperty("method", out var methodValue) ? methodValue.GetString() ?? "" : "";
var parameters = root.TryGetProperty("params", out var paramsValue) && paramsValue.ValueKind == JsonValueKind.Object
? paramsValue.Clone()
: JsonDocument.Parse("{}").RootElement.Clone();
try
{
object result = method switch
{
"list_apps" => ListApps(),
"get_app_state" => GetAppState(parameters),
"click_element" => ClickElement(parameters),
"perform_secondary_action" => PerformSecondaryAction(parameters),
"set_value" => SetValue(parameters),
"type_text" => TypeText(parameters),
"press_key" => PressKey(parameters),
"scroll_element" => ScrollElement(parameters),
"drag" => Drag(parameters),
_ => throw new InvalidOperationException($"Method is not implemented yet: {method}")
};
Write(new Dictionary<string, object?> { ["id"] = JsonValue(id), ["result"] = result });
}
catch (Exception ex)
{
Write(new Dictionary<string, object?> { ["id"] = JsonValue(id), ["error"] = ex.Message });
}
}
catch (Exception ex)
{
Write(new Dictionary<string, object?> { ["id"] = null, ["error"] = $"Invalid JSON request: {ex.Message}" });
}
}
}
private static object? JsonValue(JsonElement element)
{
return element.ValueKind switch
{
JsonValueKind.String => element.GetString(),
JsonValueKind.Number => element.TryGetInt64(out var number) ? number : element.GetDouble(),
JsonValueKind.True => true,
JsonValueKind.False => false,
_ => null
};
}
private static void Write(object value)
{
Console.WriteLine(JsonSerializer.Serialize(value));
Console.Out.Flush();
}
private static List<Dictionary<string, object?>> ListApps()
{
return Process.GetProcesses()
.Where(process => process.MainWindowHandle != IntPtr.Zero)
.OrderBy(process => process.ProcessName)
.Select(process => new Dictionary<string, object?>
{
["id"] = process.Id.ToString(),
["name"] = process.ProcessName,
["processName"] = process.ProcessName,
["pid"] = process.Id,
["running"] = true,
["windowTitle"] = process.MainWindowTitle
})
.ToList();
}
private static Process ResolveProcess(string query)
{
var normalized = query.Trim();
if (string.IsNullOrWhiteSpace(normalized))
{
throw new InvalidOperationException("app is required.");
}
var processes = Process.GetProcesses()
.Where(process => process.MainWindowHandle != IntPtr.Zero)
.ToList();
return processes.FirstOrDefault(process => process.ProcessName.Equals(normalized, StringComparison.OrdinalIgnoreCase))
?? processes.FirstOrDefault(process => process.MainWindowTitle.Equals(normalized, StringComparison.OrdinalIgnoreCase))
?? processes.FirstOrDefault(process => process.MainWindowTitle.Contains(normalized, StringComparison.OrdinalIgnoreCase))
?? throw new InvalidOperationException($"App is not running: {query}");
}
private static Dictionary<string, object?> GetAppState(JsonElement parameters)
{
var appQuery = ReadString(parameters, "app");
var process = ResolveProcess(appQuery);
var root = AutomationElement.FromHandle(process.MainWindowHandle)
?? throw new InvalidOperationException("No UI Automation root window is available.");
var records = new List<ElementRecord>();
var automationElements = new Dictionary<string, AutomationElement>();
Walk(root, records, automationElements, 0, 5, 300);
var stateId = $"state_{Guid.NewGuid()}";
StateElements[stateId] = records;
StateAutomationElements[stateId] = automationElements;
StateOrder.Enqueue(stateId);
PruneStoredStates();
var elements = records.Select(record => record.ToDictionary()).ToList();
var bounds = root.Current.BoundingRectangle;
return new Dictionary<string, object?>
{
["stateId"] = stateId,
["app"] = process.ProcessName,
["platform"] = "win32",
["screenshotDataUrl"] = CaptureScreen(),
["displaySize"] = new Dictionary<string, object?>
{
["width"] = (int)System.Windows.Forms.Screen.PrimaryScreen!.Bounds.Width,
["height"] = (int)System.Windows.Forms.Screen.PrimaryScreen!.Bounds.Height
},
["window"] = new Dictionary<string, object?>
{
["title"] = process.MainWindowTitle,
["bounds"] = BoundsDictionary(bounds)
},
["elements"] = elements,
["accessibilityTree"] = elements,
["treeText"] = string.Join("\n", elements.Select(element => $"{element["index"]} {element["role"]} {element.GetValueOrDefault("title")}"))
};
}
private static Dictionary<string, object?> ClickElement(JsonElement parameters)
{
var mouseButton = ReadString(parameters, "mouse_button");
if ((mouseButton == "" || mouseButton == "left") && ReadInt(parameters, "click_count", 1) == 1)
{
var element = AutomationElementFor(parameters);
if (element != null && TryInvoke(element))
{
return GetAppState(parameters);
}
}
var point = PointFor(parameters);
if (point == null)
{
throw new InvalidOperationException("click_element requires x/y or stateId + element_index.");
}
SendMouseClick(point.Value.X, point.Value.Y, ReadString(parameters, "mouse_button"), ReadInt(parameters, "click_count", 1));
return GetAppState(parameters);
}
private static Dictionary<string, object?> PerformSecondaryAction(JsonElement parameters)
{
var point = PointFor(parameters);
if (point == null)
{
throw new InvalidOperationException("perform_secondary_action requires x/y or stateId + element_index.");
}
SendMouseClick(point.Value.X, point.Value.Y, "right", 1);
return GetAppState(parameters);
}
private static Dictionary<string, object?> SetValue(JsonElement parameters)
{
var value = ReadString(parameters, "value");
var element = AutomationElementFor(parameters);
var focused = false;
if (element != null)
{
if (element.TryGetCurrentPattern(ValuePattern.Pattern, out var valuePattern))
{
((ValuePattern)valuePattern).SetValue(value);
return GetAppState(parameters);
}
try
{
element.SetFocus();
focused = true;
}
catch
{
// Fall through to coordinate focus below.
}
}
var point = PointFor(parameters);
if (point != null)
{
SendMouseClick(point.Value.X, point.Value.Y, "left", 1);
focused = true;
}
else if (!focused && element == null)
{
throw new InvalidOperationException("set_value requires x/y or stateId + element_index.");
}
else if (!focused)
{
throw new InvalidOperationException("set_value could not focus the requested element.");
}
System.Windows.Forms.SendKeys.SendWait("^a");
System.Windows.Forms.SendKeys.SendWait(EscapeSendKeys(value));
return GetAppState(parameters);
}
private static Dictionary<string, object?> TypeText(JsonElement parameters)
{
var text = ReadString(parameters, "text");
System.Windows.Forms.SendKeys.SendWait(EscapeSendKeys(text));
return GetAppState(parameters);
}
private static Dictionary<string, object?> PressKey(JsonElement parameters)
{
var key = ReadString(parameters, "key");
System.Windows.Forms.SendKeys.SendWait(ToSendKeysChord(key));
return GetAppState(parameters);
}
private static Dictionary<string, object?> ScrollElement(JsonElement parameters)
{
var element = AutomationElementFor(parameters);
var direction = ReadString(parameters, "direction");
var pages = ReadDouble(parameters, "pages", 1);
if (element != null && element.TryGetCurrentPattern(ScrollPattern.Pattern, out var scrollPatternValue))
{
var scrollPattern = (ScrollPattern)scrollPatternValue;
var vertical = direction == "up" ? ScrollAmount.LargeDecrement : direction == "down" ? ScrollAmount.LargeIncrement : ScrollAmount.NoAmount;
var horizontal = direction == "left" ? ScrollAmount.LargeDecrement : direction == "right" ? ScrollAmount.LargeIncrement : ScrollAmount.NoAmount;
scrollPattern.Scroll(horizontal, vertical);
return GetAppState(parameters);
}
var point = PointFor(parameters);
if (point == null)
{
throw new InvalidOperationException("scroll_element requires x/y or stateId + element_index.");
}
SetCursorPos(point.Value.X, point.Value.Y);
var wheel = (int)Math.Round(Math.Max(1, pages) * 120);
if (direction == "down") wheel = -wheel;
mouse_event(0x0800, 0, 0, unchecked((uint)wheel), UIntPtr.Zero);
return GetAppState(parameters);
}
private static void PruneStoredStates()
{
while (StateOrder.Count > MaxStoredStates)
{
var evicted = StateOrder.Dequeue();
StateElements.Remove(evicted);
StateAutomationElements.Remove(evicted);
}
}
private static Dictionary<string, object?> Drag(JsonElement parameters)
{
var fromX = ReadDouble(parameters, "from_x", double.NaN);
var fromY = ReadDouble(parameters, "from_y", double.NaN);
var toX = ReadDouble(parameters, "to_x", double.NaN);
var toY = ReadDouble(parameters, "to_y", double.NaN);
if (double.IsNaN(fromX) || double.IsNaN(fromY) || double.IsNaN(toX) || double.IsNaN(toY))
{
throw new InvalidOperationException("drag requires from_x/from_y/to_x/to_y.");
}
SetCursorPos((int)Math.Round(fromX), (int)Math.Round(fromY));
mouse_event(0x0002, 0, 0, 0, UIntPtr.Zero);
Thread.Sleep(80);
SetCursorPos((int)Math.Round(toX), (int)Math.Round(toY));
Thread.Sleep(80);
mouse_event(0x0004, 0, 0, 0, UIntPtr.Zero);
return GetAppState(parameters);
}
private static void Walk(AutomationElement element, List<ElementRecord> records, Dictionary<string, AutomationElement> automationElements, int depth, int maxDepth, int limit)
{
if (depth > maxDepth || records.Count >= limit) return;
var index = (records.Count + 1).ToString();
records.Add(ElementRecord.From(element, index));
automationElements[index] = element;
var children = element.FindAll(TreeScope.Children, Condition.TrueCondition);
foreach (AutomationElement child in children)
{
Walk(child, records, automationElements, depth + 1, maxDepth, limit);
if (records.Count >= limit) return;
}
}
private static string ReadString(JsonElement element, string property)
{
return element.TryGetProperty(property, out var value) && value.ValueKind == JsonValueKind.String
? value.GetString() ?? ""
: "";
}
private static int ReadInt(JsonElement element, string property, int defaultValue)
{
return element.TryGetProperty(property, out var value) && value.TryGetInt32(out var number)
? number
: defaultValue;
}
private static double ReadDouble(JsonElement element, string property, double defaultValue)
{
return element.TryGetProperty(property, out var value) && value.TryGetDouble(out var number)
? number
: defaultValue;
}
private static AutomationElement? AutomationElementFor(JsonElement parameters)
{
var stateId = ReadString(parameters, "stateId");
var elementIndex = ReadString(parameters, "element_index");
return !string.IsNullOrWhiteSpace(stateId)
&& !string.IsNullOrWhiteSpace(elementIndex)
&& StateAutomationElements.TryGetValue(stateId, out var elements)
&& elements.TryGetValue(elementIndex, out var element)
? element
: null;
}
private static System.Drawing.Point? PointFor(JsonElement parameters)
{
if (parameters.TryGetProperty("x", out var xValue) && parameters.TryGetProperty("y", out var yValue)
&& xValue.TryGetDouble(out var x) && yValue.TryGetDouble(out var y))
{
return new System.Drawing.Point((int)Math.Round(x), (int)Math.Round(y));
}
var stateId = ReadString(parameters, "stateId");
var elementIndex = ReadString(parameters, "element_index");
if (string.IsNullOrWhiteSpace(stateId) || string.IsNullOrWhiteSpace(elementIndex)) return null;
if (!StateElements.TryGetValue(stateId, out var elements)) return null;
var element = elements.FirstOrDefault(item => item.Index == elementIndex);
if (element?.Bounds == null) return null;
return new System.Drawing.Point(
(int)Math.Round(element.Bounds.Value.Left + element.Bounds.Value.Width / 2),
(int)Math.Round(element.Bounds.Value.Top + element.Bounds.Value.Height / 2)
);
}
private static string CaptureScreen()
{
var bounds = System.Windows.Forms.Screen.PrimaryScreen!.Bounds;
using var bitmap = new Bitmap(bounds.Width, bounds.Height);
using var graphics = Graphics.FromImage(bitmap);
graphics.CopyFromScreen(bounds.Left, bounds.Top, 0, 0, bounds.Size);
using var stream = new MemoryStream();
bitmap.Save(stream, ImageFormat.Png);
return $"data:image/png;base64,{Convert.ToBase64String(stream.ToArray())}";
}
private static Dictionary<string, object?> BoundsDictionary(System.Windows.Rect rect)
{
return new Dictionary<string, object?>
{
["x"] = rect.X,
["y"] = rect.Y,
["width"] = rect.Width,
["height"] = rect.Height
};
}
[DllImport("user32.dll")]
private static extern bool SetCursorPos(int x, int y);
[DllImport("user32.dll")]
private static extern void mouse_event(uint dwFlags, uint dx, uint dy, uint dwData, UIntPtr dwExtraInfo);
private static void SendMouseClick(int x, int y, string button, int clickCount)
{
var (down, up) = button switch
{
"right" => (0x0008u, 0x0010u),
"middle" => (0x0020u, 0x0040u),
_ => (0x0002u, 0x0004u)
};
SetCursorPos(x, y);
for (var i = 0; i < Math.Max(1, clickCount); i++)
{
mouse_event(down, 0, 0, 0, UIntPtr.Zero);
mouse_event(up, 0, 0, 0, UIntPtr.Zero);
Thread.Sleep(80);
}
}
private static bool TryInvoke(AutomationElement element)
{
try
{
if (!element.TryGetCurrentPattern(InvokePattern.Pattern, out var pattern)) return false;
((InvokePattern)pattern).Invoke();
return true;
}
catch
{
return false;
}
}
private static string EscapeSendKeys(string value)
{
return value
.Replace("{", "{{}")
.Replace("}", "{}}")
.Replace("+", "{+}")
.Replace("^", "{^}")
.Replace("%", "{%}")
.Replace("~", "{~}")
.Replace("(", "{(}")
.Replace(")", "{)}")
.Replace("[", "{[}")
.Replace("]", "{]}");
}
private static string ToSendKeysChord(string key)
{
var normalized = key.Trim();
if (normalized.Contains('+'))
{
var parts = normalized.Split('+', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
var modifiers = "";
var last = parts.LastOrDefault() ?? "";
foreach (var part in parts.Take(parts.Length - 1))
{
modifiers += part.ToLowerInvariant() switch
{
"ctrl" or "control" => "^",
"alt" => "%",
"shift" => "+",
"cmd" or "win" or "windows" => "^",
_ => ""
};
}
return modifiers + SendKeyName(last);
}
return SendKeyName(normalized);
}
private static string SendKeyName(string key)
{
return key.ToLowerInvariant() switch
{
"return" or "enter" => "{ENTER}",
"escape" or "esc" => "{ESC}",
"tab" => "{TAB}",
"backspace" => "{BACKSPACE}",
"delete" or "del" => "{DELETE}",
"left" => "{LEFT}",
"right" => "{RIGHT}",
"up" => "{UP}",
"down" => "{DOWN}",
"space" => " ",
_ => key.Length == 1 ? EscapeSendKeys(key) : $"{{{key.ToUpperInvariant()}}}"
};
}
private sealed record ElementRecord(
string Index,
string Role,
string? Title,
string? Value,
System.Windows.Rect? Bounds,
List<string> Actions)
{
public static ElementRecord From(AutomationElement element, string index)
{
var patterns = element.GetSupportedPatterns().Select(pattern => pattern.ProgrammaticName).ToList();
return new ElementRecord(
index,
element.Current.ControlType.ProgrammaticName.Replace("ControlType.", ""),
element.Current.Name,
TryValue(element),
element.Current.BoundingRectangle,
patterns
);
}
public Dictionary<string, object?> ToDictionary()
{
var output = new Dictionary<string, object?>
{
["index"] = Index,
["role"] = Role,
["actions"] = Actions
};
if (!string.IsNullOrEmpty(Title)) output["title"] = Title;
if (!string.IsNullOrEmpty(Value)) output["value"] = Value;
if (Bounds != null) output["bounds"] = BoundsDictionary(Bounds.Value);
return output;
}
private static string? TryValue(AutomationElement element)
{
try
{
if (element.TryGetCurrentPattern(ValuePattern.Pattern, out var pattern))
{
return ((ValuePattern)pattern).Current.Value;
}
}
catch
{
return null;
}
return null;
}
}
}

View File

@@ -0,0 +1,87 @@
import { randomUUID } from 'node:crypto';
import type { SemanticAppState, SemanticElement } from '@/modules/computer-use/semantics/semantic-types.js';
const DEFAULT_STATE_TTL_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_SEMANTIC_STATE_TTL_MS || String(10 * 60 * 1000), 10);
type StoredState = {
sessionId: string;
appKey: string;
state: SemanticAppState;
updatedAt: number;
};
function normalizeAppKey(app: string): string {
return app.trim().toLowerCase();
}
export class SemanticSessionStore {
private states = new Map<string, StoredState>();
private latestBySessionApp = new Map<string, string>();
createStateId(): string {
return `state_${randomUUID()}`;
}
save(sessionId: string, state: SemanticAppState): SemanticAppState {
const appKey = normalizeAppKey(state.app);
const nextState = {
...state,
stateId: state.stateId || this.createStateId(),
};
this.states.set(nextState.stateId, {
sessionId,
appKey,
state: nextState,
updatedAt: Date.now(),
});
this.latestBySessionApp.set(this.latestKey(sessionId, appKey), nextState.stateId);
return nextState;
}
getState(sessionId: string, app: string, stateId?: string): SemanticAppState | null {
this.expire();
if (stateId) {
const entry = this.states.get(stateId);
const appKey = normalizeAppKey(app);
return entry && entry.sessionId === sessionId && entry.appKey === appKey ? entry.state : null;
}
const latestStateId = this.latestBySessionApp.get(this.latestKey(sessionId, normalizeAppKey(app)));
return latestStateId ? this.states.get(latestStateId)?.state || null : null;
}
getElement(sessionId: string, app: string, elementIndex: string, stateId?: string): SemanticElement | null {
const state = this.getState(sessionId, app, stateId);
return state?.elements.find((element) => element.index === elementIndex) || null;
}
clearSession(sessionId: string): void {
for (const [stateId, entry] of this.states.entries()) {
if (entry.sessionId === sessionId) {
this.states.delete(stateId);
this.latestBySessionApp.delete(this.latestKey(entry.sessionId, entry.appKey));
}
}
}
expire(now = Date.now()): void {
const ttl = Number.isFinite(DEFAULT_STATE_TTL_MS) && DEFAULT_STATE_TTL_MS > 0
? DEFAULT_STATE_TTL_MS
: 10 * 60 * 1000;
for (const [stateId, entry] of this.states.entries()) {
if (now - entry.updatedAt > ttl) {
this.states.delete(stateId);
const key = this.latestKey(entry.sessionId, entry.appKey);
if (this.latestBySessionApp.get(key) === stateId) {
this.latestBySessionApp.delete(key);
}
}
}
}
private latestKey(sessionId: string, appKey: string): string {
return `${sessionId}:${appKey}`;
}
}
export const semanticSessionStore = new SemanticSessionStore();

View File

@@ -0,0 +1,17 @@
export const semanticMcpToolMap: Record<string, string> = {
computer_app_drag: 'drag',
computer_click_element: 'click_element',
computer_get_app_state: 'get_app_state',
computer_list_apps: 'list_apps',
computer_perform_secondary_action: 'perform_secondary_action',
computer_press_key: 'press_key',
computer_scroll_element: 'scroll_element',
computer_set_value: 'set_value',
computer_type_text: 'type_text',
};
export const semanticOperationNames = new Set(Object.values(semanticMcpToolMap));
export function semanticOperationForMcpTool(toolName: string): string | null {
return semanticMcpToolMap[toolName] || null;
}

View File

@@ -0,0 +1,58 @@
import type { DisplaySize, Point } from '@/modules/computer-use/computer-executor.js';
export type SemanticBounds = {
x: number;
y: number;
width: number;
height: number;
};
export type SemanticApp = {
id?: string;
name: string;
bundleIdentifier?: string;
processName?: string;
pid?: number;
running: boolean;
windowTitle?: string;
};
export type SemanticElement = {
index: string;
role: string;
title?: string;
value?: string;
description?: string;
enabled?: boolean;
focused?: boolean;
selected?: boolean;
bounds?: SemanticBounds;
actions?: string[];
settableValue?: boolean;
};
export type SemanticAppState = {
stateId: string;
app: string;
platform: NodeJS.Platform;
screenshotDataUrl: string | null;
displaySize: DisplaySize | null;
elements: SemanticElement[];
accessibilityTree: SemanticElement[];
treeText?: string;
message?: string;
};
export type SemanticToolInput = Record<string, unknown> & {
sessionId?: string;
app?: string;
stateId?: string;
element_index?: string;
};
export type SemanticToolResult = SemanticAppState | {
apps: SemanticApp[];
platform: NodeJS.Platform;
};
export type SemanticActionPoint = Point;