mirror of
https://github.com/siteboon/claudecodeui.git
synced 2026-06-30 09:02:56 +08:00
feat: add Electron desktop app
This commit is contained in:
67
server/modules/computer-use/actions/raw-action-dispatcher.ts
Normal file
67
server/modules/computer-use/actions/raw-action-dispatcher.ts
Normal file
@@ -0,0 +1,67 @@
|
||||
import {
|
||||
captureScreenshot,
|
||||
executor,
|
||||
type ExecutorTarget,
|
||||
} from '@/modules/computer-use/computer-executor.js';
|
||||
import type { RawActionResult, RawComputerAction, RawActionTarget } from '@/modules/computer-use/actions/raw-action-types.js';
|
||||
|
||||
const DEFAULT_WAIT_MS = 1000;
|
||||
const MAX_WAIT_MS = 10_000;
|
||||
|
||||
function normalizeWaitMs(ms: number | undefined): number {
|
||||
if (ms === undefined) {
|
||||
return DEFAULT_WAIT_MS;
|
||||
}
|
||||
if (!Number.isFinite(ms)) {
|
||||
throw new Error('Computer Use wait duration must be a finite number.');
|
||||
}
|
||||
return Math.trunc(Math.max(0, Math.min(ms, MAX_WAIT_MS)));
|
||||
}
|
||||
|
||||
async function snapshot(target: RawActionTarget): Promise<RawActionResult> {
|
||||
const { dataUrl, size } = await captureScreenshot();
|
||||
return { screenshotDataUrl: dataUrl, displaySize: size || target.displaySize };
|
||||
}
|
||||
|
||||
export async function runRawComputerAction(
|
||||
action: RawComputerAction,
|
||||
target: RawActionTarget,
|
||||
): Promise<RawActionResult> {
|
||||
const executorTarget: ExecutorTarget = {
|
||||
displaySize: target.displaySize,
|
||||
};
|
||||
|
||||
switch (action.type) {
|
||||
case 'screenshot':
|
||||
return snapshot(target);
|
||||
case 'cursor_position': {
|
||||
const position = await executor.cursorPosition(executorTarget);
|
||||
return { ...(await snapshot(target)), position, cursor: position };
|
||||
}
|
||||
case 'mouse_move':
|
||||
await executor.moveTo(executorTarget, action.point);
|
||||
return { ...(await snapshot(target)), cursor: action.point };
|
||||
case 'click':
|
||||
await executor.click(executorTarget, action.button, action.point, action.double === true);
|
||||
return { ...(await snapshot(target)), cursor: action.point ?? null };
|
||||
case 'drag':
|
||||
await executor.drag(executorTarget, action.from, action.to, action.button ?? 'left');
|
||||
return { ...(await snapshot(target)), cursor: action.to };
|
||||
case 'type':
|
||||
await executor.type(action.text);
|
||||
return snapshot(target);
|
||||
case 'key':
|
||||
await executor.pressChord(action.key);
|
||||
return snapshot(target);
|
||||
case 'scroll':
|
||||
await executor.scroll(executorTarget, action.direction, action.amount ?? 3, action.point);
|
||||
return { ...(await snapshot(target)), cursor: action.point ?? null };
|
||||
case 'wait':
|
||||
await new Promise((resolve) => setTimeout(resolve, normalizeWaitMs(action.ms)));
|
||||
return snapshot(target);
|
||||
default: {
|
||||
const exhaustive: never = action;
|
||||
throw new Error(`Unsupported computer action: ${(exhaustive as { type?: string }).type || 'unknown'}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
28
server/modules/computer-use/actions/raw-action-types.ts
Normal file
28
server/modules/computer-use/actions/raw-action-types.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
import type {
|
||||
ClickButton,
|
||||
DisplaySize,
|
||||
Point,
|
||||
ScrollDirection,
|
||||
} from '@/modules/computer-use/computer-executor.js';
|
||||
|
||||
export type RawComputerAction =
|
||||
| { type: 'screenshot' }
|
||||
| { type: 'cursor_position' }
|
||||
| { type: 'mouse_move'; point: Point }
|
||||
| { type: 'click'; button: ClickButton; point?: Point; double?: boolean }
|
||||
| { type: 'drag'; from: Point; to: Point; button?: ClickButton }
|
||||
| { type: 'type'; text: string }
|
||||
| { type: 'key'; key: string }
|
||||
| { type: 'scroll'; direction: ScrollDirection; amount?: number; point?: Point }
|
||||
| { type: 'wait'; ms?: number };
|
||||
|
||||
export type RawActionTarget = {
|
||||
displaySize: DisplaySize | null;
|
||||
};
|
||||
|
||||
export type RawActionResult = {
|
||||
screenshotDataUrl?: string | null;
|
||||
displaySize?: DisplaySize | null;
|
||||
cursor?: Point | null;
|
||||
position?: Point | null;
|
||||
};
|
||||
242
server/modules/computer-use/computer-executor.ts
Normal file
242
server/modules/computer-use/computer-executor.ts
Normal file
@@ -0,0 +1,242 @@
|
||||
import { createRequire } from 'node:module';
|
||||
|
||||
const require = createRequire(import.meta.url);
|
||||
|
||||
export type Point = { x: number; y: number };
|
||||
export type ClickButton = 'left' | 'right' | 'middle';
|
||||
export type ScrollDirection = 'up' | 'down' | 'left' | 'right';
|
||||
export type DisplaySize = { width: number; height: number };
|
||||
|
||||
export type RuntimeReadiness = {
|
||||
nut: any | null;
|
||||
screenshot: any | null;
|
||||
nutInstalled: boolean;
|
||||
screenshotInstalled: boolean;
|
||||
};
|
||||
|
||||
/**
|
||||
* Coordinate space the executor reports/accepts. The screenshot pixel space is
|
||||
* the canonical space agents and users address; it is mapped to the nut-js
|
||||
* logical mouse space before any action runs.
|
||||
*/
|
||||
export type ExecutorTarget = {
|
||||
displaySize: DisplaySize | null;
|
||||
};
|
||||
|
||||
export function getNut(): any | null {
|
||||
try {
|
||||
return require('@nut-tree-fork/nut-js');
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
export function getScreenshot(): any | null {
|
||||
try {
|
||||
const mod = require('screenshot-desktop');
|
||||
return mod?.default || mod;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
export function getRuntimeReadiness(): RuntimeReadiness {
|
||||
const nut = getNut();
|
||||
const screenshot = getScreenshot();
|
||||
return {
|
||||
nut,
|
||||
screenshot,
|
||||
nutInstalled: Boolean(nut),
|
||||
screenshotInstalled: typeof screenshot === 'function',
|
||||
};
|
||||
}
|
||||
|
||||
/** Reads the pixel dimensions from a PNG/JPEG buffer header without decoding it. */
|
||||
export function readImageSize(buffer: Buffer): DisplaySize | null {
|
||||
// PNG: 8-byte signature, then IHDR chunk with width/height as big-endian uint32.
|
||||
if (buffer.length >= 24 && buffer[0] === 0x89 && buffer[1] === 0x50) {
|
||||
return { width: buffer.readUInt32BE(16), height: buffer.readUInt32BE(20) };
|
||||
}
|
||||
// JPEG: scan for a Start-Of-Frame marker (0xFFC0..0xFFCF, excluding C4/C8/CC).
|
||||
if (buffer.length >= 4 && buffer[0] === 0xff && buffer[1] === 0xd8) {
|
||||
let offset = 2;
|
||||
while (offset + 9 < buffer.length) {
|
||||
if (buffer[offset] !== 0xff) {
|
||||
offset += 1;
|
||||
continue;
|
||||
}
|
||||
const marker = buffer[offset + 1];
|
||||
if (marker >= 0xc0 && marker <= 0xcf && marker !== 0xc4 && marker !== 0xc8 && marker !== 0xcc) {
|
||||
return { height: buffer.readUInt16BE(offset + 5), width: buffer.readUInt16BE(offset + 7) };
|
||||
}
|
||||
offset += 2 + buffer.readUInt16BE(offset + 2);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
export async function captureScreenshot(): Promise<{ dataUrl: string; size: DisplaySize | null }> {
|
||||
const screenshot = getScreenshot();
|
||||
if (typeof screenshot !== 'function') {
|
||||
throw new Error('Computer Use runtime is not available.');
|
||||
}
|
||||
const buffer: Buffer = await screenshot({ format: 'png' });
|
||||
return {
|
||||
dataUrl: `data:image/png;base64,${buffer.toString('base64')}`,
|
||||
size: readImageSize(buffer),
|
||||
};
|
||||
}
|
||||
|
||||
/** Returns the mouse coordinate space size (logical screen pixels). */
|
||||
export async function getMouseSpaceSize(): Promise<DisplaySize> {
|
||||
const nut = getNut();
|
||||
if (!nut) {
|
||||
throw new Error('Computer Use runtime is not available.');
|
||||
}
|
||||
const width = await nut.screen.width();
|
||||
const height = await nut.screen.height();
|
||||
return { width, height };
|
||||
}
|
||||
|
||||
/** Maps a point from screenshot/image space to the mouse coordinate space. */
|
||||
export async function toMouseSpace(target: ExecutorTarget, point: Point): Promise<Point> {
|
||||
const mouseSize = await getMouseSpaceSize();
|
||||
const image = target.displaySize || mouseSize;
|
||||
const scaleX = image.width ? mouseSize.width / image.width : 1;
|
||||
const scaleY = image.height ? mouseSize.height / image.height : 1;
|
||||
return {
|
||||
x: Math.round(point.x * scaleX),
|
||||
y: Math.round(point.y * scaleY),
|
||||
};
|
||||
}
|
||||
|
||||
/** Maps a point from the mouse coordinate space back to screenshot/image space. */
|
||||
export function toImageSpace(target: ExecutorTarget, point: Point, mouseSize: DisplaySize): Point {
|
||||
const image = target.displaySize || mouseSize;
|
||||
const scaleX = mouseSize.width ? image.width / mouseSize.width : 1;
|
||||
const scaleY = mouseSize.height ? image.height / mouseSize.height : 1;
|
||||
return {
|
||||
x: Math.round(point.x * scaleX),
|
||||
y: Math.round(point.y * scaleY),
|
||||
};
|
||||
}
|
||||
|
||||
function nutButton(nut: any, button: ClickButton) {
|
||||
if (button === 'right') return nut.Button.RIGHT;
|
||||
if (button === 'middle') return nut.Button.MIDDLE;
|
||||
return nut.Button.LEFT;
|
||||
}
|
||||
|
||||
/** Maps a key name (xdotool-style, as Anthropic's computer tool emits) to a nut-js Key. */
|
||||
function nutKey(nut: any, token: string): any {
|
||||
const map: Record<string, string> = {
|
||||
return: 'Enter', enter: 'Enter', esc: 'Escape', escape: 'Escape', tab: 'Tab',
|
||||
space: 'Space', backspace: 'Backspace', delete: 'Delete', del: 'Delete', insert: 'Insert',
|
||||
up: 'Up', down: 'Down', left: 'Left', right: 'Right',
|
||||
home: 'Home', end: 'End', pageup: 'PageUp', page_up: 'PageUp', pagedown: 'PageDown', page_down: 'PageDown',
|
||||
ctrl: 'LeftControl', control: 'LeftControl', alt: 'LeftAlt', shift: 'LeftShift',
|
||||
meta: 'LeftSuper', super: 'LeftSuper', cmd: 'LeftSuper', win: 'LeftSuper',
|
||||
capslock: 'CapsLock',
|
||||
};
|
||||
const lower = token.toLowerCase();
|
||||
if (map[lower]) {
|
||||
return nut.Key[map[lower]];
|
||||
}
|
||||
if (/^f([1-9]|1[0-9]|2[0-4])$/.test(lower)) {
|
||||
return nut.Key[`F${lower.slice(1)}`];
|
||||
}
|
||||
if (token.length === 1) {
|
||||
const upper = token.toUpperCase();
|
||||
if (nut.Key[upper] !== undefined) {
|
||||
return nut.Key[upper];
|
||||
}
|
||||
if (nut.Key[`Num${token}`] !== undefined && /[0-9]/.test(token)) {
|
||||
return nut.Key[`Num${token}`];
|
||||
}
|
||||
}
|
||||
throw new Error(`Unsupported key: ${token}`);
|
||||
}
|
||||
|
||||
/**
|
||||
* The cross-platform OS executor. It is intentionally free of any server,
|
||||
* database, or session dependencies so it can run both inside the local server
|
||||
* process (OSS mode) and inside the standalone desktop agent (cloud relay).
|
||||
*/
|
||||
export const executor = {
|
||||
async configure() {
|
||||
const nut = getNut();
|
||||
if (nut) {
|
||||
// Make actions responsive; the agent loop already paces itself with screenshots.
|
||||
nut.mouse.config.autoDelayMs = 2;
|
||||
nut.keyboard.config.autoDelayMs = 2;
|
||||
}
|
||||
return nut;
|
||||
},
|
||||
|
||||
async cursorPosition(target: ExecutorTarget): Promise<Point> {
|
||||
const nut = await this.configure();
|
||||
const mouseSize = await getMouseSpaceSize();
|
||||
const pos = await nut.mouse.getPosition();
|
||||
return toImageSpace(target, { x: pos.x, y: pos.y }, mouseSize);
|
||||
},
|
||||
|
||||
async moveTo(target: ExecutorTarget, point: Point): Promise<void> {
|
||||
const nut = await this.configure();
|
||||
const dest = await toMouseSpace(target, point);
|
||||
await nut.mouse.setPosition(new nut.Point(dest.x, dest.y));
|
||||
},
|
||||
|
||||
async click(target: ExecutorTarget, button: ClickButton, point?: Point, doubleClick = false): Promise<void> {
|
||||
const nut = await this.configure();
|
||||
if (point) {
|
||||
await this.moveTo(target, point);
|
||||
}
|
||||
if (doubleClick) {
|
||||
await nut.mouse.doubleClick(nutButton(nut, button));
|
||||
} else {
|
||||
await nut.mouse.click(nutButton(nut, button));
|
||||
}
|
||||
},
|
||||
|
||||
async drag(target: ExecutorTarget, from: Point, to: Point, button: ClickButton = 'left'): Promise<void> {
|
||||
const nut = await this.configure();
|
||||
const start = await toMouseSpace(target, from);
|
||||
const end = await toMouseSpace(target, to);
|
||||
await nut.mouse.setPosition(new nut.Point(start.x, start.y));
|
||||
await nut.mouse.pressButton(nutButton(nut, button));
|
||||
await nut.mouse.setPosition(new nut.Point(end.x, end.y));
|
||||
await nut.mouse.releaseButton(nutButton(nut, button));
|
||||
},
|
||||
|
||||
async type(text: string): Promise<void> {
|
||||
const nut = await this.configure();
|
||||
await nut.keyboard.type(text);
|
||||
},
|
||||
|
||||
async pressChord(chord: string): Promise<void> {
|
||||
const nut = await this.configure();
|
||||
const tokens = chord.split('+').map((token) => token.trim()).filter(Boolean);
|
||||
if (tokens.length === 0) {
|
||||
return;
|
||||
}
|
||||
const keys = tokens.map((token) => nutKey(nut, token));
|
||||
for (const key of keys) {
|
||||
await nut.keyboard.pressKey(key);
|
||||
}
|
||||
for (const key of [...keys].reverse()) {
|
||||
await nut.keyboard.releaseKey(key);
|
||||
}
|
||||
},
|
||||
|
||||
async scroll(target: ExecutorTarget, direction: ScrollDirection, amount: number, point?: Point): Promise<void> {
|
||||
const nut = await this.configure();
|
||||
if (point) {
|
||||
await this.moveTo(target, point);
|
||||
}
|
||||
const steps = Math.max(1, Math.round(amount));
|
||||
if (direction === 'up') await nut.mouse.scrollUp(steps);
|
||||
else if (direction === 'down') await nut.mouse.scrollDown(steps);
|
||||
else if (direction === 'left') await nut.mouse.scrollLeft(steps);
|
||||
else await nut.mouse.scrollRight(steps);
|
||||
},
|
||||
};
|
||||
460
server/modules/computer-use/computer-semantics.service.ts
Normal file
460
server/modules/computer-use/computer-semantics.service.ts
Normal file
@@ -0,0 +1,460 @@
|
||||
import { execFile } from 'node:child_process';
|
||||
import { promisify } from 'node:util';
|
||||
|
||||
import {
|
||||
captureScreenshot,
|
||||
executor,
|
||||
type ClickButton,
|
||||
type ExecutorTarget,
|
||||
type Point,
|
||||
type ScrollDirection,
|
||||
} from '@/modules/computer-use/computer-executor.js';
|
||||
import type { SemanticAdapter } from '@/modules/computer-use/semantics/adapters/semantic-adapter.js';
|
||||
import { createMacOsSemanticAdapter } from '@/modules/computer-use/semantics/adapters/macos/macos-semantic-adapter.js';
|
||||
import { createWindowsSemanticAdapter } from '@/modules/computer-use/semantics/adapters/windows/windows-semantic-adapter.js';
|
||||
import { resolveSemanticHelper } from '@/modules/computer-use/semantics/helpers/semantic-helper-resolver.js';
|
||||
import { semanticSessionStore } from '@/modules/computer-use/semantics/semantic-session-store.js';
|
||||
import type { SemanticAppState, SemanticElement } from '@/modules/computer-use/semantics/semantic-types.js';
|
||||
|
||||
const execFileAsync = promisify(execFile);
|
||||
const MAX_APP_STATE_ELEMENTS = 250;
|
||||
let helperAdapter: SemanticAdapter | null | undefined;
|
||||
|
||||
function readString(value: unknown): string {
|
||||
return typeof value === 'string' ? value.trim() : '';
|
||||
}
|
||||
|
||||
function requireApp(input: Record<string, unknown>): string {
|
||||
const app = readString(input.app);
|
||||
if (!app) {
|
||||
throw new Error('app is required.');
|
||||
}
|
||||
return app;
|
||||
}
|
||||
|
||||
function readNumber(value: unknown): number | undefined {
|
||||
return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
|
||||
}
|
||||
|
||||
function readButton(value: unknown): ClickButton {
|
||||
return value === 'right' || value === 'middle' ? value : 'left';
|
||||
}
|
||||
|
||||
function readClickCount(value: unknown): number {
|
||||
const count = readNumber(value);
|
||||
if (count === undefined) {
|
||||
return 1;
|
||||
}
|
||||
return Math.max(1, Math.min(5, Math.trunc(count)));
|
||||
}
|
||||
|
||||
function readDirection(value: unknown): ScrollDirection {
|
||||
return value === 'up' || value === 'left' || value === 'right' ? value : 'down';
|
||||
}
|
||||
|
||||
function readSessionId(input: Record<string, unknown>): string {
|
||||
return readString(input.sessionId) || 'default';
|
||||
}
|
||||
|
||||
function centerOf(element: SemanticElement): Point | null {
|
||||
const bounds = element.bounds;
|
||||
if (!bounds) {
|
||||
return null;
|
||||
}
|
||||
return {
|
||||
x: Math.round(bounds.x + bounds.width / 2),
|
||||
y: Math.round(bounds.y + bounds.height / 2),
|
||||
};
|
||||
}
|
||||
|
||||
function getCachedElement(sessionId: string, app: string, index: string, stateId?: string): SemanticElement | null {
|
||||
return semanticSessionStore.getElement(sessionId, app, index, stateId);
|
||||
}
|
||||
|
||||
function getPoint(input: Record<string, unknown>, sessionId: string, app: string): Point | undefined {
|
||||
const x = readNumber(input.x);
|
||||
const y = readNumber(input.y);
|
||||
if (x !== undefined && y !== undefined) {
|
||||
return { x, y };
|
||||
}
|
||||
|
||||
const elementIndex = readString(input.element_index);
|
||||
if (!elementIndex) {
|
||||
return undefined;
|
||||
}
|
||||
const element = getCachedElement(sessionId, app, elementIndex, readString(input.stateId) || undefined);
|
||||
return element ? centerOf(element) || undefined : undefined;
|
||||
}
|
||||
|
||||
function getHelperAdapter(): SemanticAdapter | null {
|
||||
if (helperAdapter !== undefined) {
|
||||
return helperAdapter;
|
||||
}
|
||||
|
||||
if (process.platform !== 'darwin' && process.platform !== 'win32') {
|
||||
helperAdapter = null;
|
||||
return helperAdapter;
|
||||
}
|
||||
|
||||
const resolution = resolveSemanticHelper();
|
||||
if (!resolution.available) {
|
||||
helperAdapter = null;
|
||||
return helperAdapter;
|
||||
}
|
||||
|
||||
helperAdapter = process.platform === 'darwin'
|
||||
? createMacOsSemanticAdapter()
|
||||
: createWindowsSemanticAdapter();
|
||||
return helperAdapter;
|
||||
}
|
||||
|
||||
function shouldFallbackFromHelper(error: unknown): boolean {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
return /not implemented|unavailable|not found|does not exist|timed out|not running|exited with code|failed to start/i.test(message);
|
||||
}
|
||||
|
||||
async function withHelperState(
|
||||
sessionId: string,
|
||||
operation: (adapter: SemanticAdapter) => Promise<SemanticAppState>,
|
||||
): Promise<SemanticAppState | null> {
|
||||
const adapter = getHelperAdapter();
|
||||
if (!adapter) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return semanticSessionStore.save(sessionId, await operation(adapter));
|
||||
} catch (error) {
|
||||
if (shouldFallbackFromHelper(error)) {
|
||||
console.warn('[ComputerSemantics] Falling back from helper:', error instanceof Error ? error.message : String(error));
|
||||
return null;
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
async function run(command: string, args: string[], timeout = 5000): Promise<string> {
|
||||
const { stdout } = await execFileAsync(command, args, {
|
||||
timeout,
|
||||
windowsHide: true,
|
||||
maxBuffer: 1024 * 1024 * 4,
|
||||
});
|
||||
return stdout;
|
||||
}
|
||||
|
||||
async function listMacApps(): Promise<Array<Record<string, unknown>>> {
|
||||
const script = [
|
||||
'tell application "System Events"',
|
||||
'set appRows to {}',
|
||||
'repeat with p in (application processes whose background only is false)',
|
||||
'set end of appRows to (name of p as text)',
|
||||
'end repeat',
|
||||
'return appRows',
|
||||
'end tell',
|
||||
].join('\n');
|
||||
const output = await run('osascript', ['-e', script]);
|
||||
return output.split(', ')
|
||||
.map((name) => name.trim())
|
||||
.filter(Boolean)
|
||||
.map((name) => ({ name, running: true }));
|
||||
}
|
||||
|
||||
async function listWindowsApps(): Promise<Array<Record<string, unknown>>> {
|
||||
const script = [
|
||||
'Get-Process | Where-Object { $_.MainWindowTitle } |',
|
||||
'Select-Object ProcessName, Id, MainWindowTitle | ConvertTo-Json -Depth 3',
|
||||
].join(' ');
|
||||
const output = await run('powershell.exe', ['-NoProfile', '-Command', script]);
|
||||
const parsed = JSON.parse(output || '[]');
|
||||
const rows = Array.isArray(parsed) ? parsed : [parsed];
|
||||
return rows.map((row) => ({
|
||||
name: row.ProcessName,
|
||||
pid: row.Id,
|
||||
windowTitle: row.MainWindowTitle,
|
||||
running: true,
|
||||
}));
|
||||
}
|
||||
|
||||
async function listLinuxApps(): Promise<Array<Record<string, unknown>>> {
|
||||
try {
|
||||
const output = await run('wmctrl', ['-lx']);
|
||||
return output.split(/\r?\n/)
|
||||
.map((line) => line.trim())
|
||||
.filter(Boolean)
|
||||
.map((line) => {
|
||||
const parts = line.split(/\s+/);
|
||||
return {
|
||||
windowId: parts[0],
|
||||
desktop: parts[1],
|
||||
host: parts[2],
|
||||
className: parts[3],
|
||||
windowTitle: parts.slice(4).join(' '),
|
||||
running: true,
|
||||
};
|
||||
});
|
||||
} catch {
|
||||
const output = await run('ps', ['-eo', 'comm=']);
|
||||
return [...new Set(output.split(/\r?\n/).map((name) => name.trim()).filter(Boolean))]
|
||||
.slice(0, 200)
|
||||
.map((name) => ({ name, running: true }));
|
||||
}
|
||||
}
|
||||
|
||||
async function listApps(): Promise<Array<Record<string, unknown>>> {
|
||||
if (process.platform === 'darwin') {
|
||||
return listMacApps();
|
||||
}
|
||||
if (process.platform === 'win32') {
|
||||
return listWindowsApps();
|
||||
}
|
||||
return listLinuxApps();
|
||||
}
|
||||
|
||||
async function macAccessibilityTree(app: string): Promise<SemanticElement[]> {
|
||||
const escapedApp = app.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
|
||||
const script = `
|
||||
on safeText(v)
|
||||
try
|
||||
return v as text
|
||||
on error
|
||||
return ""
|
||||
end try
|
||||
end safeText
|
||||
|
||||
on emitElement(e, depth, maxDepth, counter)
|
||||
if depth > maxDepth then return {}
|
||||
set rows to {}
|
||||
try
|
||||
set roleText to my safeText(role of e)
|
||||
on error
|
||||
set roleText to "element"
|
||||
end try
|
||||
try
|
||||
set titleText to my safeText(title of e)
|
||||
on error
|
||||
set titleText to ""
|
||||
end try
|
||||
try
|
||||
set valueText to my safeText(value of e)
|
||||
on error
|
||||
set valueText to ""
|
||||
end try
|
||||
try
|
||||
set posValue to position of e
|
||||
set sizeValue to size of e
|
||||
set boundsText to ((item 1 of posValue) as text) & "," & ((item 2 of posValue) as text) & "," & ((item 1 of sizeValue) as text) & "," & ((item 2 of sizeValue) as text)
|
||||
on error
|
||||
set boundsText to ""
|
||||
end try
|
||||
set end of rows to ((counter as text) & tab & roleText & tab & titleText & tab & valueText & tab & boundsText)
|
||||
if counter > ${MAX_APP_STATE_ELEMENTS} then return rows
|
||||
try
|
||||
repeat with childElement in UI elements of e
|
||||
set childRows to my emitElement(childElement, depth + 1, maxDepth, counter + (count of rows))
|
||||
set rows to rows & childRows
|
||||
if (count of rows) > ${MAX_APP_STATE_ELEMENTS} then return rows
|
||||
end repeat
|
||||
end try
|
||||
return rows
|
||||
end emitElement
|
||||
|
||||
tell application "System Events"
|
||||
if not (exists process "${escapedApp}") then error "App is not running: ${escapedApp}"
|
||||
tell process "${escapedApp}"
|
||||
set rows to {}
|
||||
repeat with w in windows
|
||||
set rows to rows & my emitElement(w, 0, 4, (count of rows) + 1)
|
||||
if (count of rows) > ${MAX_APP_STATE_ELEMENTS} then exit repeat
|
||||
end repeat
|
||||
return rows
|
||||
end tell
|
||||
end tell
|
||||
`;
|
||||
const output = await run('osascript', ['-e', script], 10000);
|
||||
return output.split(/\r?\n|, /)
|
||||
.map((line) => line.trim())
|
||||
.filter(Boolean)
|
||||
.map((line, index) => {
|
||||
const [rawIndex, role, title, value, boundsText] = line.split('\t');
|
||||
const boundsParts = (boundsText || '').split(',').map((part) => Number.parseFloat(part));
|
||||
const hasBounds = boundsParts.length === 4 && boundsParts.every(Number.isFinite);
|
||||
return {
|
||||
index: rawIndex || String(index + 1),
|
||||
role: role || 'element',
|
||||
title: title || undefined,
|
||||
value: value || undefined,
|
||||
bounds: hasBounds
|
||||
? { x: boundsParts[0], y: boundsParts[1], width: boundsParts[2], height: boundsParts[3] }
|
||||
: undefined,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
async function getAccessibilityTree(app: string): Promise<{ elements: SemanticElement[]; message?: string }> {
|
||||
if (process.platform === 'darwin') {
|
||||
try {
|
||||
return { elements: await macAccessibilityTree(app) };
|
||||
} catch (error) {
|
||||
return { elements: [], message: error instanceof Error ? error.message : String(error) };
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
elements: [],
|
||||
message: 'Native accessibility tree capture is not implemented for this platform yet.',
|
||||
};
|
||||
}
|
||||
|
||||
async function getAppState(sessionId: string, app: string): Promise<SemanticAppState> {
|
||||
if (!app) {
|
||||
throw new Error('app is required.');
|
||||
}
|
||||
const helperState = await withHelperState(sessionId, (adapter) => adapter.getAppState({ sessionId, app }));
|
||||
if (helperState) {
|
||||
return helperState;
|
||||
}
|
||||
|
||||
const screenshot = await captureScreenshot();
|
||||
const tree = await getAccessibilityTree(app);
|
||||
const state: SemanticAppState = {
|
||||
stateId: semanticSessionStore.createStateId(),
|
||||
app,
|
||||
platform: process.platform,
|
||||
screenshotDataUrl: screenshot.dataUrl,
|
||||
displaySize: screenshot.size,
|
||||
elements: tree.elements,
|
||||
accessibilityTree: tree.elements,
|
||||
message: tree.message,
|
||||
};
|
||||
return semanticSessionStore.save(sessionId, state);
|
||||
}
|
||||
|
||||
async function targetFor(sessionId: string, app: string, stateId?: string): Promise<ExecutorTarget> {
|
||||
const cached = semanticSessionStore.getState(sessionId, app, stateId);
|
||||
return { displaySize: cached?.displaySize || (await captureScreenshot()).size };
|
||||
}
|
||||
|
||||
export const computerSemanticsService = {
|
||||
async callTool(name: string, input: Record<string, unknown>): Promise<unknown> {
|
||||
const sessionId = readSessionId(input);
|
||||
switch (name) {
|
||||
case 'list_apps': {
|
||||
const adapter = getHelperAdapter();
|
||||
if (adapter) {
|
||||
try {
|
||||
return { apps: await adapter.listApps(), platform: process.platform };
|
||||
} catch (error) {
|
||||
if (!shouldFallbackFromHelper(error)) {
|
||||
throw error;
|
||||
}
|
||||
console.warn('[ComputerSemantics] Falling back from helper:', error instanceof Error ? error.message : String(error));
|
||||
}
|
||||
}
|
||||
return { apps: await listApps(), platform: process.platform };
|
||||
}
|
||||
case 'get_app_state':
|
||||
return getAppState(sessionId, readString(input.app));
|
||||
case 'click':
|
||||
case 'click_element': {
|
||||
const app = requireApp(input);
|
||||
const helperState = await withHelperState(sessionId, (adapter) => adapter.clickElement({ ...input, sessionId, app }));
|
||||
if (helperState) {
|
||||
return helperState;
|
||||
}
|
||||
const stateId = readString(input.stateId) || undefined;
|
||||
const point = getPoint(input, sessionId, app);
|
||||
if (!point) {
|
||||
throw new Error('click requires x/y or an element_index from computer_get_app_state.');
|
||||
}
|
||||
const target = await targetFor(sessionId, app, stateId);
|
||||
const button = readButton(input.mouse_button ?? input.mouseButton);
|
||||
const clickCount = readClickCount(input.click_count ?? input.clickCount);
|
||||
for (let index = 0; index < clickCount; index += 1) {
|
||||
await executor.click(target, button, point, false);
|
||||
}
|
||||
return getAppState(sessionId, app);
|
||||
}
|
||||
case 'drag': {
|
||||
const app = requireApp(input);
|
||||
const helperState = await withHelperState(sessionId, (adapter) => adapter.drag({ ...input, sessionId, app }));
|
||||
if (helperState) {
|
||||
return helperState;
|
||||
}
|
||||
const stateId = readString(input.stateId) || undefined;
|
||||
const fromX = readNumber(input.from_x);
|
||||
const fromY = readNumber(input.from_y);
|
||||
const toX = readNumber(input.to_x);
|
||||
const toY = readNumber(input.to_y);
|
||||
if (fromX === undefined || fromY === undefined || toX === undefined || toY === undefined) {
|
||||
throw new Error('drag requires from_x/from_y/to_x/to_y.');
|
||||
}
|
||||
await executor.drag(await targetFor(sessionId, app, stateId), { x: fromX, y: fromY }, { x: toX, y: toY }, readButton(input.mouse_button ?? input.mouseButton));
|
||||
return getAppState(sessionId, app);
|
||||
}
|
||||
case 'scroll':
|
||||
case 'scroll_element': {
|
||||
const app = requireApp(input);
|
||||
const helperState = await withHelperState(sessionId, (adapter) => adapter.scrollElement({ ...input, sessionId, app }));
|
||||
if (helperState) {
|
||||
return helperState;
|
||||
}
|
||||
const stateId = readString(input.stateId) || undefined;
|
||||
const point = getPoint(input, sessionId, app);
|
||||
if (!point) {
|
||||
throw new Error('scroll requires x/y or an element_index from computer_get_app_state.');
|
||||
}
|
||||
await executor.scroll(await targetFor(sessionId, app, stateId), readDirection(input.direction), readNumber(input.pages) ?? 1, point);
|
||||
return getAppState(sessionId, app);
|
||||
}
|
||||
case 'type_text': {
|
||||
const app = requireApp(input);
|
||||
const helperState = await withHelperState(sessionId, (adapter) => adapter.typeText({ ...input, sessionId, app }));
|
||||
if (helperState) {
|
||||
return helperState;
|
||||
}
|
||||
await executor.type(readString(input.text));
|
||||
return getAppState(sessionId, app);
|
||||
}
|
||||
case 'press_key': {
|
||||
const app = requireApp(input);
|
||||
const helperState = await withHelperState(sessionId, (adapter) => adapter.pressKey({ ...input, sessionId, app }));
|
||||
if (helperState) {
|
||||
return helperState;
|
||||
}
|
||||
await executor.pressChord(readString(input.key));
|
||||
return getAppState(sessionId, app);
|
||||
}
|
||||
case 'set_value': {
|
||||
const app = requireApp(input);
|
||||
const helperState = await withHelperState(sessionId, (adapter) => adapter.setValue({ ...input, sessionId, app }));
|
||||
if (helperState) {
|
||||
return helperState;
|
||||
}
|
||||
const stateId = readString(input.stateId) || undefined;
|
||||
const point = getPoint(input, sessionId, app);
|
||||
if (!point) {
|
||||
throw new Error('set_value requires x/y or an element_index from computer_get_app_state.');
|
||||
}
|
||||
await executor.click(await targetFor(sessionId, app, stateId), 'left', point, false);
|
||||
await executor.pressChord(process.platform === 'darwin' ? 'cmd+a' : 'ctrl+a');
|
||||
await executor.type(readString(input.value));
|
||||
return getAppState(sessionId, app);
|
||||
}
|
||||
case 'perform_secondary_action': {
|
||||
const app = requireApp(input);
|
||||
const helperState = await withHelperState(sessionId, (adapter) => adapter.performSecondaryAction({ ...input, sessionId, app }));
|
||||
if (helperState) {
|
||||
return helperState;
|
||||
}
|
||||
const stateId = readString(input.stateId) || undefined;
|
||||
const point = getPoint(input, sessionId, app);
|
||||
if (!point) {
|
||||
throw new Error('perform_secondary_action requires x/y or an element_index from computer_get_app_state.');
|
||||
}
|
||||
await executor.click(await targetFor(sessionId, app, stateId), 'right', point, false);
|
||||
return getAppState(sessionId, app);
|
||||
}
|
||||
default:
|
||||
throw new Error(`Unknown semantic Computer Use tool: ${name}`);
|
||||
}
|
||||
},
|
||||
};
|
||||
141
server/modules/computer-use/computer-use-mcp.routes.ts
Normal file
141
server/modules/computer-use/computer-use-mcp.routes.ts
Normal file
@@ -0,0 +1,141 @@
|
||||
import express from 'express';
|
||||
|
||||
import { computerUseService } from '@/modules/computer-use/computer-use.service.js';
|
||||
import { semanticOperationForMcpTool } from '@/modules/computer-use/semantics/semantic-tool-dispatcher.js';
|
||||
|
||||
const router = express.Router();
|
||||
|
||||
function readBearerToken(header: unknown): string | null {
|
||||
if (typeof header !== 'string') {
|
||||
return null;
|
||||
}
|
||||
const trimmed = header.trim();
|
||||
const scheme = 'Bearer';
|
||||
if (trimmed.slice(0, scheme.length).toLowerCase() !== scheme.toLowerCase()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const separator = trimmed[scheme.length];
|
||||
if (separator !== ' ' && separator !== '\t') {
|
||||
return null;
|
||||
}
|
||||
|
||||
return trimmed.slice(scheme.length + 1).trimStart() || null;
|
||||
}
|
||||
|
||||
function toButton(value: unknown): 'left' | 'right' | 'middle' {
|
||||
return value === 'right' || value === 'middle' ? value : 'left';
|
||||
}
|
||||
|
||||
function toScrollDirection(value: unknown): 'up' | 'down' | 'left' | 'right' {
|
||||
return value === 'down' || value === 'left' || value === 'right' ? value : 'up';
|
||||
}
|
||||
|
||||
function point(input: Record<string, unknown>): { x: number; y: number } | undefined {
|
||||
return typeof input.x === 'number' && typeof input.y === 'number'
|
||||
? { x: input.x, y: input.y }
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function requireNumber(input: Record<string, unknown>, name: string): number {
|
||||
const value = input[name];
|
||||
if (typeof value !== 'number' || !Number.isFinite(value)) {
|
||||
throw new Error(`${name} is required and must be a finite number.`);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
function requirePoint(input: Record<string, unknown>): { x: number; y: number } {
|
||||
return { x: requireNumber(input, 'x'), y: requireNumber(input, 'y') };
|
||||
}
|
||||
|
||||
function requireNamedPoint(input: Record<string, unknown>, xName: string, yName: string): { x: number; y: number } {
|
||||
return { x: requireNumber(input, xName), y: requireNumber(input, yName) };
|
||||
}
|
||||
|
||||
router.use((req, res, next) => {
|
||||
const expected = computerUseService.getMcpToken();
|
||||
const token = readBearerToken(req.headers.authorization) || String(req.headers['x-computer-use-mcp-token'] || '');
|
||||
if (!token || token !== expected) {
|
||||
res.status(401).json({ success: false, error: 'Invalid Computer Use MCP token.' });
|
||||
return;
|
||||
}
|
||||
next();
|
||||
});
|
||||
|
||||
router.post('/tools/:toolName', async (req, res) => {
|
||||
try {
|
||||
const input = (req.body && typeof req.body === 'object' ? req.body : {}) as Record<string, unknown>;
|
||||
const sessionId = typeof input.sessionId === 'string' ? input.sessionId : undefined;
|
||||
const toolName = req.params.toolName;
|
||||
const semanticOperation = semanticOperationForMcpTool(toolName);
|
||||
let result: unknown;
|
||||
|
||||
if (semanticOperation) {
|
||||
result = await computerUseService.callSemanticTool(semanticOperation, input);
|
||||
res.json({ success: true, data: result });
|
||||
return;
|
||||
}
|
||||
|
||||
switch (toolName) {
|
||||
case 'computer_screenshot':
|
||||
result = await computerUseService.agentScreenshot(sessionId);
|
||||
break;
|
||||
case 'computer_cursor_position':
|
||||
result = await computerUseService.agentCursorPosition(sessionId);
|
||||
break;
|
||||
case 'computer_mouse_move':
|
||||
result = await computerUseService.agentMouseMove(sessionId, requirePoint(input));
|
||||
break;
|
||||
case 'computer_click':
|
||||
result = await computerUseService.agentUnifiedClick(sessionId, {
|
||||
button: toButton(input.mouseButton ?? input.mouse_button ?? input.button),
|
||||
point: point(input),
|
||||
clickCount: typeof input.clickCount === 'number'
|
||||
? input.clickCount
|
||||
: typeof input.click_count === 'number'
|
||||
? input.click_count
|
||||
: 1,
|
||||
});
|
||||
break;
|
||||
case 'computer_drag': {
|
||||
const from = requireNamedPoint(input, 'startX', 'startY');
|
||||
const to = requireNamedPoint(input, 'endX', 'endY');
|
||||
result = await computerUseService.agentDrag(sessionId, from, to, toButton(input.mouseButton ?? input.mouse_button ?? input.button));
|
||||
break;
|
||||
}
|
||||
case 'computer_type':
|
||||
result = await computerUseService.agentType(sessionId, String(input.text || ''));
|
||||
break;
|
||||
case 'computer_key':
|
||||
result = await computerUseService.agentKey(sessionId, String(input.key || ''));
|
||||
break;
|
||||
case 'computer_scroll':
|
||||
result = await computerUseService.agentScroll(sessionId, {
|
||||
direction: toScrollDirection(input.direction),
|
||||
amount: typeof input.amount === 'number' ? input.amount : undefined,
|
||||
x: typeof input.x === 'number' ? input.x : undefined,
|
||||
y: typeof input.y === 'number' ? input.y : undefined,
|
||||
});
|
||||
break;
|
||||
case 'computer_wait':
|
||||
result = await computerUseService.agentWait(sessionId, typeof input.timeoutMs === 'number' ? input.timeoutMs : undefined);
|
||||
break;
|
||||
case 'computer_close_session':
|
||||
result = await computerUseService.agentStopSession(sessionId);
|
||||
break;
|
||||
default:
|
||||
res.status(404).json({ success: false, error: `Unknown Computer Use MCP tool "${toolName}".` });
|
||||
return;
|
||||
}
|
||||
|
||||
res.json({ success: true, data: result });
|
||||
} catch (error) {
|
||||
res.status(400).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Computer Use MCP tool failed.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
export default router;
|
||||
211
server/modules/computer-use/computer-use.routes.ts
Normal file
211
server/modules/computer-use/computer-use.routes.ts
Normal file
@@ -0,0 +1,211 @@
|
||||
import express from 'express';
|
||||
|
||||
import { computerUseService } from '@/modules/computer-use/computer-use.service.js';
|
||||
import { AppError } from '@/shared/utils.js';
|
||||
|
||||
const router = express.Router();
|
||||
|
||||
type AuthenticatedRequest = express.Request & {
|
||||
user?: {
|
||||
id?: string | number;
|
||||
};
|
||||
};
|
||||
|
||||
function requireUser(req: AuthenticatedRequest): { id: string | number } {
|
||||
const userId = req.user?.id;
|
||||
if (userId === undefined || userId === null || String(userId).trim() === '') {
|
||||
throw new AppError('Authenticated user is required.', {
|
||||
code: 'AUTHENTICATED_USER_REQUIRED',
|
||||
statusCode: 401,
|
||||
});
|
||||
}
|
||||
return { id: userId };
|
||||
}
|
||||
|
||||
function getErrorStatusCode(error: unknown, fallbackStatusCode: number): number {
|
||||
if (error instanceof AppError) {
|
||||
return error.statusCode;
|
||||
}
|
||||
|
||||
if (error && typeof error === 'object') {
|
||||
const statusCode = 'statusCode' in error ? error.statusCode : 'status' in error ? error.status : undefined;
|
||||
if (typeof statusCode === 'number' && Number.isInteger(statusCode) && statusCode >= 400 && statusCode <= 599) {
|
||||
return statusCode;
|
||||
}
|
||||
}
|
||||
|
||||
return fallbackStatusCode;
|
||||
}
|
||||
|
||||
function readParam(value: string | string[] | undefined): string {
|
||||
return Array.isArray(value) ? value[0] || '' : value || '';
|
||||
}
|
||||
|
||||
function toButton(value: unknown): 'left' | 'right' | 'middle' {
|
||||
return value === 'right' || value === 'middle' ? value : 'left';
|
||||
}
|
||||
|
||||
router.get('/status', async (_req, res) => {
|
||||
try {
|
||||
res.json({ success: true, data: await computerUseService.getStatus() });
|
||||
} catch (error) {
|
||||
res.status(500).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to load Computer Use status.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.get('/settings', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
requireUser(req);
|
||||
res.json({ success: true, data: { settings: await computerUseService.getSettings() } });
|
||||
} catch (error) {
|
||||
res.status(getErrorStatusCode(error, 500)).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to load Computer Use settings.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.put('/settings', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
requireUser(req);
|
||||
const settings = await computerUseService.updateSettings(req.body || {});
|
||||
res.json({ success: true, data: { settings } });
|
||||
} catch (error) {
|
||||
res.status(getErrorStatusCode(error, 400)).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to save Computer Use settings.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.post('/runtime/install', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
requireUser(req);
|
||||
const result = await computerUseService.installRuntime();
|
||||
res.status(result.success ? 200 : 500).json({
|
||||
success: result.success,
|
||||
data: result,
|
||||
error: result.success ? undefined : result.message,
|
||||
});
|
||||
} catch (error) {
|
||||
res.status(getErrorStatusCode(error, 500)).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to install Computer Use runtime.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.get('/sessions', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
res.json({ success: true, data: { sessions: await computerUseService.listSessions(requireUser(req)) } });
|
||||
} catch (error) {
|
||||
res.status(getErrorStatusCode(error, 500)).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to list Computer Use sessions.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.post('/sessions/:sessionId/screenshot', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const session = await computerUseService.userScreenshot(requireUser(req), readParam(req.params.sessionId));
|
||||
res.json({ success: true, data: { session } });
|
||||
} catch (error) {
|
||||
res.status(400).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to capture the screen.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.post('/sessions/:sessionId/click', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const x = Number(req.body?.x);
|
||||
const y = Number(req.body?.y);
|
||||
if (!Number.isFinite(x) || !Number.isFinite(y)) {
|
||||
res.status(400).json({
|
||||
success: false,
|
||||
error: 'Valid numeric coordinates are required.',
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const session = await computerUseService.userClick(requireUser(req), readParam(req.params.sessionId), {
|
||||
x,
|
||||
y,
|
||||
button: toButton(req.body?.button),
|
||||
double: req.body?.double === true,
|
||||
});
|
||||
res.json({ success: true, data: { session } });
|
||||
} catch (error) {
|
||||
res.status(400).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to click.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.post('/sessions/:sessionId/press-key', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const session = await computerUseService.userPressKey(requireUser(req), readParam(req.params.sessionId), String(req.body?.key || ''));
|
||||
res.json({ success: true, data: { session } });
|
||||
} catch (error) {
|
||||
res.status(400).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to send key input.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.post('/sessions/:sessionId/consent/grant', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const session = await computerUseService.grantAgentAccess(requireUser(req), readParam(req.params.sessionId));
|
||||
res.json({ success: true, data: { session } });
|
||||
} catch (error) {
|
||||
res.status(400).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to grant control.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.post('/sessions/:sessionId/consent/revoke', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const session = await computerUseService.revokeAgentAccess(requireUser(req), readParam(req.params.sessionId));
|
||||
res.json({ success: true, data: { session } });
|
||||
} catch (error) {
|
||||
res.status(400).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to revoke control.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.post('/sessions/:sessionId/stop', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const result = await computerUseService.stopSession(requireUser(req), readParam(req.params.sessionId));
|
||||
res.json({ success: true, data: result });
|
||||
} catch (error) {
|
||||
res.status(400).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to stop Computer Use session.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
router.delete('/sessions/:sessionId', async (req: AuthenticatedRequest, res) => {
|
||||
try {
|
||||
const result = await computerUseService.deleteSession(requireUser(req), readParam(req.params.sessionId));
|
||||
res.json({ success: true, data: result });
|
||||
} catch (error) {
|
||||
res.status(400).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Failed to delete Computer Use session.',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
export default router;
|
||||
920
server/modules/computer-use/computer-use.service.ts
Normal file
920
server/modules/computer-use/computer-use.service.ts
Normal file
@@ -0,0 +1,920 @@
|
||||
import { randomBytes, randomUUID } from 'node:crypto';
|
||||
import { spawn } from 'node:child_process';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
|
||||
import { appConfigDb } from '@/modules/database/index.js';
|
||||
import { providerMcpService } from '@/modules/providers/index.js';
|
||||
import { getModuleDir } from '@/utils/runtime-paths.js';
|
||||
import {
|
||||
getRuntimeReadiness as getExecutorReadiness,
|
||||
type Point,
|
||||
type ClickButton,
|
||||
type ScrollDirection,
|
||||
} from '@/modules/computer-use/computer-executor.js';
|
||||
import { runRawComputerAction } from '@/modules/computer-use/actions/raw-action-dispatcher.js';
|
||||
import type { RawComputerAction } from '@/modules/computer-use/actions/raw-action-types.js';
|
||||
import { desktopAgentRelay } from '@/modules/computer-use/desktop-agent-relay.service.js';
|
||||
import { computerSemanticsService } from '@/modules/computer-use/computer-semantics.service.js';
|
||||
import { semanticOperationNames } from '@/modules/computer-use/semantics/semantic-tool-dispatcher.js';
|
||||
|
||||
const __dirname = getModuleDir(import.meta.url);
|
||||
const IS_PLATFORM = process.env.VITE_IS_PLATFORM === 'true';
|
||||
const MAX_SESSIONS_PER_OWNER = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_MAX_SESSIONS_PER_OWNER || '1', 10);
|
||||
const SESSION_TTL_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_SESSION_TTL_MS || String(30 * 60 * 1000), 10);
|
||||
const STOPPED_SESSION_RETENTION_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_STOPPED_SESSION_RETENTION_MS || String(30 * 60 * 1000), 10);
|
||||
const MAX_STORED_SESSIONS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_MAX_STORED_SESSIONS || '100', 10);
|
||||
const COMPUTER_USE_SETTINGS_KEY = 'computer_use_settings';
|
||||
const COMPUTER_USE_MCP_TOKEN_KEY = 'computer_use_mcp_token';
|
||||
type ComputerUseRuntime = 'cloud' | 'local';
|
||||
type ComputerUseSessionStatus = 'ready' | 'stopped' | 'unavailable';
|
||||
|
||||
type ComputerUseSession = {
|
||||
id: string;
|
||||
ownerId: string;
|
||||
createdBy: 'user' | 'agent';
|
||||
runtime: ComputerUseRuntime;
|
||||
status: ComputerUseSessionStatus;
|
||||
screenshotDataUrl: string | null;
|
||||
createdAt: string;
|
||||
updatedAt: string;
|
||||
lastAction: string | null;
|
||||
message: string | null;
|
||||
/** Per-session consent: agents may act only while this is true. */
|
||||
agentAccessEnabled: boolean;
|
||||
/** Size of the captured screenshot in pixels — the coordinate space agents/users use. */
|
||||
displaySize: {
|
||||
width: number;
|
||||
height: number;
|
||||
} | null;
|
||||
cursor: {
|
||||
x: number;
|
||||
y: number;
|
||||
actor: 'agent' | 'user';
|
||||
} | null;
|
||||
};
|
||||
|
||||
type PublicComputerUseSession = Omit<ComputerUseSession, 'ownerId'>;
|
||||
|
||||
type ComputerUseOwner = {
|
||||
id: string | number;
|
||||
};
|
||||
|
||||
type ComputerUseSettings = {
|
||||
enabled: boolean;
|
||||
};
|
||||
|
||||
type RuntimeReadiness = {
|
||||
nut: any | null;
|
||||
screenshot: any | null;
|
||||
nutInstalled: boolean;
|
||||
screenshotInstalled: boolean;
|
||||
installInProgress: boolean;
|
||||
installMessage: string | null;
|
||||
};
|
||||
|
||||
const sessions = new Map<string, ComputerUseSession>();
|
||||
let installPromise: Promise<{ success: boolean; message: string }> | null = null;
|
||||
let lastInstallMessage: string | null = null;
|
||||
|
||||
const DEFAULT_SETTINGS: ComputerUseSettings = {
|
||||
enabled: false,
|
||||
};
|
||||
const AGENT_OWNER_ID = 'agent';
|
||||
const MCP_SERVER_NAME = 'cloudcli-computer-use';
|
||||
const MCP_PROVIDERS = ['claude', 'codex', 'cursor', 'gemini', 'opencode'];
|
||||
|
||||
function getRuntime(): ComputerUseRuntime {
|
||||
return IS_PLATFORM ? 'cloud' : 'local';
|
||||
}
|
||||
|
||||
function readSettings(): ComputerUseSettings {
|
||||
try {
|
||||
const raw = appConfigDb.get(COMPUTER_USE_SETTINGS_KEY);
|
||||
if (!raw) {
|
||||
return DEFAULT_SETTINGS;
|
||||
}
|
||||
|
||||
const parsed = JSON.parse(raw) as Partial<ComputerUseSettings>;
|
||||
return {
|
||||
enabled: parsed.enabled === true,
|
||||
};
|
||||
} catch (error: any) {
|
||||
console.warn('[Computer Use] Failed to read settings:', error?.message || error);
|
||||
return DEFAULT_SETTINGS;
|
||||
}
|
||||
}
|
||||
|
||||
function writeSettings(settings: ComputerUseSettings): ComputerUseSettings {
|
||||
const normalized = {
|
||||
enabled: settings.enabled === true,
|
||||
};
|
||||
|
||||
appConfigDb.set(COMPUTER_USE_SETTINGS_KEY, JSON.stringify(normalized));
|
||||
return normalized;
|
||||
}
|
||||
|
||||
function getOrCreateMcpToken(): string {
|
||||
const existing = appConfigDb.get(COMPUTER_USE_MCP_TOKEN_KEY);
|
||||
if (existing) {
|
||||
return existing;
|
||||
}
|
||||
const token = randomBytes(32).toString('hex');
|
||||
appConfigDb.set(COMPUTER_USE_MCP_TOKEN_KEY, token);
|
||||
return token;
|
||||
}
|
||||
|
||||
function getSetupMessage(settings: ComputerUseSettings, readiness: RuntimeReadiness): string {
|
||||
if (!settings.enabled) {
|
||||
return 'Computer Use is disabled in settings.';
|
||||
}
|
||||
if (getRuntime() === 'cloud') {
|
||||
return 'Open CloudCLI Desktop on this computer, connect the same account, and enable Computer Use.';
|
||||
}
|
||||
if (!readiness.nutInstalled || !readiness.screenshotInstalled) {
|
||||
return 'Install the desktop control runtime to capture the screen and drive the mouse and keyboard.';
|
||||
}
|
||||
return readiness.installMessage || 'Computer Use runtime is not ready.';
|
||||
}
|
||||
|
||||
function getMcpCommand(): { command: string; args: string[] } {
|
||||
const serverDir = path.resolve(__dirname, '..', '..');
|
||||
const mcpScriptPath = path.join(serverDir, 'computer-use-mcp.js');
|
||||
if (fs.existsSync(mcpScriptPath)) {
|
||||
return {
|
||||
command: process.execPath,
|
||||
args: [mcpScriptPath],
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
command: 'cloudcli',
|
||||
args: ['computer-use-mcp'],
|
||||
};
|
||||
}
|
||||
|
||||
function getMcpApiUrl(): string {
|
||||
const port = process.env.SERVER_PORT || process.env.PORT || '3001';
|
||||
return `http://127.0.0.1:${port}/api/computer-use-mcp`;
|
||||
}
|
||||
|
||||
function getRuntimeReadiness(): RuntimeReadiness {
|
||||
const base = getExecutorReadiness();
|
||||
return {
|
||||
...base,
|
||||
installInProgress: Boolean(installPromise),
|
||||
installMessage: lastInstallMessage,
|
||||
};
|
||||
}
|
||||
|
||||
function runCommand(command: string, args: string[]): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const child = spawn(command, args, {
|
||||
cwd: process.cwd(),
|
||||
env: process.env,
|
||||
shell: false,
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
});
|
||||
const output: string[] = [];
|
||||
|
||||
child.stdout.on('data', (chunk) => output.push(String(chunk)));
|
||||
child.stderr.on('data', (chunk) => output.push(String(chunk)));
|
||||
child.on('error', reject);
|
||||
child.on('close', (code) => {
|
||||
if (code === 0) {
|
||||
resolve();
|
||||
return;
|
||||
}
|
||||
|
||||
reject(new Error(output.join('').trim() || `${command} ${args.join(' ')} exited with code ${code}`));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
function formatInstallError(error: unknown): string {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
if (process.platform === 'linux' && /libxtst|x11|xtst|libpng|imagemagick|scrot/i.test(message)) {
|
||||
return [
|
||||
'Installing the desktop control runtime needs system packages.',
|
||||
'On Debian/Ubuntu run: sudo apt-get install -y libxtst-dev libpng-dev imagemagick',
|
||||
'then try again.',
|
||||
].join(' ');
|
||||
}
|
||||
return message || 'Failed to install the Computer Use runtime.';
|
||||
}
|
||||
|
||||
function isPackagedElectronNodeRuntime(): boolean {
|
||||
return process.env.ELECTRON_RUN_AS_NODE === '1' && Boolean(process.versions.electron);
|
||||
}
|
||||
|
||||
async function installRuntime(): Promise<{ success: boolean; message: string }> {
|
||||
if (installPromise) {
|
||||
return installPromise;
|
||||
}
|
||||
|
||||
const readiness = getExecutorReadiness();
|
||||
if (readiness.nutInstalled && readiness.screenshotInstalled) {
|
||||
lastInstallMessage = 'Computer Use runtime is available.';
|
||||
return { success: true, message: lastInstallMessage };
|
||||
}
|
||||
|
||||
if (isPackagedElectronNodeRuntime()) {
|
||||
lastInstallMessage = 'Computer Use runtime was not bundled with this desktop build.';
|
||||
return { success: false, message: lastInstallMessage };
|
||||
}
|
||||
|
||||
const npmCommand = process.platform === 'win32' ? 'npm.cmd' : 'npm';
|
||||
installPromise = (async () => {
|
||||
try {
|
||||
lastInstallMessage = 'Installing desktop control runtime…';
|
||||
await runCommand(npmCommand, [
|
||||
'install',
|
||||
'--no-save',
|
||||
'--no-package-lock',
|
||||
'@nut-tree-fork/nut-js',
|
||||
'screenshot-desktop',
|
||||
]);
|
||||
|
||||
lastInstallMessage = 'Computer Use runtime installed.';
|
||||
return { success: true, message: lastInstallMessage };
|
||||
} catch (error) {
|
||||
lastInstallMessage = formatInstallError(error);
|
||||
return { success: false, message: lastInstallMessage };
|
||||
}
|
||||
})();
|
||||
|
||||
try {
|
||||
return await installPromise;
|
||||
} finally {
|
||||
installPromise = null;
|
||||
}
|
||||
}
|
||||
|
||||
function getOwnerId(owner: ComputerUseOwner): string {
|
||||
if (owner.id === undefined || owner.id === null || String(owner.id).trim() === '') {
|
||||
throw new Error('Authenticated user is required.');
|
||||
}
|
||||
|
||||
return String(owner.id);
|
||||
}
|
||||
|
||||
function publicSession(session: ComputerUseSession): PublicComputerUseSession {
|
||||
const { ownerId: _ownerId, ...publicFields } = session;
|
||||
return publicFields;
|
||||
}
|
||||
|
||||
function ownerSessions(ownerId: string): ComputerUseSession[] {
|
||||
return [...sessions.values()].filter((session) => session.ownerId === ownerId);
|
||||
}
|
||||
|
||||
function canAccessSession(ownerId: string, session: ComputerUseSession): boolean {
|
||||
return session.ownerId === ownerId || session.ownerId === AGENT_OWNER_ID;
|
||||
}
|
||||
|
||||
function normalizeSessionId(sessionId?: string | null): string | null {
|
||||
if (typeof sessionId !== 'string') {
|
||||
return null;
|
||||
}
|
||||
const trimmed = sessionId.trim();
|
||||
return trimmed ? trimmed : null;
|
||||
}
|
||||
|
||||
function findActiveAgentSession(): ComputerUseSession | null {
|
||||
return ownerSessions(AGENT_OWNER_ID)
|
||||
.filter((session) => session.status === 'ready')
|
||||
.sort((a, b) => Date.parse(b.updatedAt) - Date.parse(a.updatedAt))[0] || null;
|
||||
}
|
||||
|
||||
function positiveDuration(value: number, fallback: number): number {
|
||||
return Number.isFinite(value) && value > 0 ? value : fallback;
|
||||
}
|
||||
|
||||
async function expireStaleSessions(now = Date.now()): Promise<void> {
|
||||
const sessionTtl = positiveDuration(SESSION_TTL_MS, 30 * 60 * 1000);
|
||||
const stoppedRetention = positiveDuration(STOPPED_SESSION_RETENTION_MS, sessionTtl);
|
||||
|
||||
for (const [sessionId, session] of sessions.entries()) {
|
||||
const updatedAt = Date.parse(session.updatedAt);
|
||||
if (!Number.isFinite(updatedAt)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (session.status === 'ready') {
|
||||
if (now - updatedAt <= sessionTtl) {
|
||||
continue;
|
||||
}
|
||||
session.status = 'stopped';
|
||||
session.agentAccessEnabled = false;
|
||||
session.updatedAt = new Date(now).toISOString();
|
||||
session.lastAction = 'expire';
|
||||
session.message = 'Computer Use session expired after inactivity.';
|
||||
continue;
|
||||
}
|
||||
|
||||
if (now - updatedAt > stoppedRetention) {
|
||||
sessions.delete(sessionId);
|
||||
}
|
||||
}
|
||||
|
||||
const maxStoredSessions = Number.isFinite(MAX_STORED_SESSIONS) && MAX_STORED_SESSIONS > 0
|
||||
? MAX_STORED_SESSIONS
|
||||
: 100;
|
||||
if (sessions.size <= maxStoredSessions) {
|
||||
return;
|
||||
}
|
||||
|
||||
const removable = [...sessions.values()]
|
||||
.filter((session) => session.status !== 'ready')
|
||||
.sort((a, b) => Date.parse(a.updatedAt) - Date.parse(b.updatedAt));
|
||||
for (const session of removable) {
|
||||
if (sessions.size <= maxStoredSessions) {
|
||||
break;
|
||||
}
|
||||
sessions.delete(session.id);
|
||||
}
|
||||
}
|
||||
|
||||
// --- Action layer: local executor (OSS) or cloud relay to the desktop agent --
|
||||
//
|
||||
// Every desktop interaction goes through `performAction` / `getCursorPosition`.
|
||||
// In local mode it drives the in-process nut-js executor (computer-executor.ts);
|
||||
// in cloud mode it forwards the action to the linked desktop agent over
|
||||
// `desktopAgentRelay` and applies the returned screenshot. The local server
|
||||
// itself never touches the OS in cloud mode.
|
||||
|
||||
/** Shape the desktop agent returns for any relayed action. */
|
||||
type RelayResult = {
|
||||
screenshotDataUrl?: string | null;
|
||||
displaySize?: { width: number; height: number } | null;
|
||||
cursor?: { x: number; y: number } | null;
|
||||
position?: Point | null;
|
||||
};
|
||||
|
||||
function applyRelayResult(session: ComputerUseSession, result: RelayResult): void {
|
||||
if (typeof result.screenshotDataUrl === 'string') {
|
||||
session.screenshotDataUrl = result.screenshotDataUrl;
|
||||
}
|
||||
if (result.displaySize) {
|
||||
session.displaySize = result.displaySize;
|
||||
}
|
||||
if (result.cursor) {
|
||||
session.cursor = { x: result.cursor.x, y: result.cursor.y, actor: session.cursor?.actor ?? 'agent' };
|
||||
}
|
||||
session.updatedAt = new Date().toISOString();
|
||||
}
|
||||
|
||||
function stripSessionArgs(args: Record<string, unknown>): Record<string, unknown> {
|
||||
const { sessionId: _sessionId, ...toolArgs } = args;
|
||||
return toolArgs;
|
||||
}
|
||||
|
||||
async function refreshScreenshot(session: ComputerUseSession): Promise<void> {
|
||||
if (getRuntime() === 'cloud') {
|
||||
const result = (await desktopAgentRelay.relay('screenshot', { sessionId: session.id })) as RelayResult;
|
||||
applyRelayResult(session, result);
|
||||
return;
|
||||
}
|
||||
applyRelayResult(session, await runRawComputerAction({ type: 'screenshot' }, session));
|
||||
}
|
||||
|
||||
/** Runs one action and refreshes the session screenshot afterwards. */
|
||||
async function performAction(session: ComputerUseSession, action: RawComputerAction): Promise<void> {
|
||||
if (getRuntime() === 'cloud') {
|
||||
const result = (await desktopAgentRelay.relay(action.type, {
|
||||
...action,
|
||||
sessionId: session.id,
|
||||
displaySize: session.displaySize,
|
||||
})) as RelayResult;
|
||||
applyRelayResult(session, result);
|
||||
return;
|
||||
}
|
||||
|
||||
applyRelayResult(session, await runRawComputerAction(action, session));
|
||||
}
|
||||
|
||||
/** Reads the current cursor position in screenshot-pixel space. */
|
||||
async function getCursorPosition(session: ComputerUseSession): Promise<Point> {
|
||||
if (getRuntime() === 'cloud') {
|
||||
const result = (await desktopAgentRelay.relay('cursor_position', {
|
||||
sessionId: session.id,
|
||||
displaySize: session.displaySize,
|
||||
})) as RelayResult;
|
||||
applyRelayResult(session, result);
|
||||
if (result.position) {
|
||||
return result.position;
|
||||
}
|
||||
return session.cursor ? { x: session.cursor.x, y: session.cursor.y } : { x: 0, y: 0 };
|
||||
}
|
||||
const result = await runRawComputerAction({ type: 'cursor_position' }, session);
|
||||
applyRelayResult(session, result);
|
||||
return result.position || session.cursor || { x: 0, y: 0 };
|
||||
}
|
||||
|
||||
function assertReady(session: ComputerUseSession): void {
|
||||
if (session.status !== 'ready') {
|
||||
throw new Error(session.message || 'Computer Use session is not available.');
|
||||
}
|
||||
}
|
||||
|
||||
function agentToolsAvailable(): boolean {
|
||||
const settings = readSettings();
|
||||
if (!settings.enabled) {
|
||||
return false;
|
||||
}
|
||||
if (getRuntime() === 'cloud') {
|
||||
return desktopAgentRelay.isConnected();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
function assertAgentToolsAvailable(): void {
|
||||
if (agentToolsAvailable()) {
|
||||
return;
|
||||
}
|
||||
const settings = readSettings();
|
||||
if (!settings.enabled) {
|
||||
throw new Error('Computer Use agent tools are disabled.');
|
||||
}
|
||||
throw new Error(
|
||||
getRuntime() === 'cloud'
|
||||
? 'No desktop is linked. Open CloudCLI Desktop on this computer, connect the same account, and enable Computer Use.'
|
||||
: 'Computer Use agent tools are disabled.'
|
||||
);
|
||||
}
|
||||
|
||||
function stopSessions(lastAction: string, message: string): void {
|
||||
for (const session of sessions.values()) {
|
||||
session.status = 'stopped';
|
||||
session.agentAccessEnabled = false;
|
||||
session.updatedAt = new Date().toISOString();
|
||||
session.lastAction = lastAction;
|
||||
session.message = message;
|
||||
}
|
||||
}
|
||||
|
||||
export const computerUseService = {
|
||||
async getSettings() {
|
||||
return readSettings();
|
||||
},
|
||||
|
||||
async updateSettings(settings: Partial<ComputerUseSettings>) {
|
||||
const current = readSettings();
|
||||
const enabled = typeof settings.enabled === 'boolean' ? settings.enabled : current.enabled;
|
||||
const next = writeSettings({ enabled });
|
||||
if (next.enabled) {
|
||||
await this.registerAgentMcp();
|
||||
} else {
|
||||
await this.unregisterAgentMcp();
|
||||
desktopAgentRelay.disconnectAll('Computer Use was disabled in this environment.');
|
||||
stopSessions('settings:disabled', 'Computer Use was disabled in settings.');
|
||||
}
|
||||
return next;
|
||||
},
|
||||
|
||||
async getStatus() {
|
||||
const settings = readSettings();
|
||||
const readiness = getRuntimeReadiness();
|
||||
const isCloud = getRuntime() === 'cloud';
|
||||
const runtimeReady = readiness.nutInstalled && readiness.screenshotInstalled;
|
||||
// Cloud mode still respects the saved feature setting. When enabled, cloud
|
||||
// availability comes from a linked desktop agent because the hosted server
|
||||
// has no screen of its own.
|
||||
const desktopAgentConnected = desktopAgentRelay.isConnected();
|
||||
const available = settings.enabled && (isCloud
|
||||
? desktopAgentConnected
|
||||
: runtimeReady);
|
||||
|
||||
return {
|
||||
enabled: settings.enabled,
|
||||
runtime: getRuntime(),
|
||||
available,
|
||||
desktopAgentConnected,
|
||||
desktopAgentCount: desktopAgentRelay.connectedCount(),
|
||||
nutInstalled: readiness.nutInstalled,
|
||||
screenshotInstalled: readiness.screenshotInstalled,
|
||||
installInProgress: readiness.installInProgress,
|
||||
sessionCount: sessions.size,
|
||||
message: available ? 'Computer Use runtime is available.' : getSetupMessage(settings, readiness),
|
||||
};
|
||||
},
|
||||
|
||||
async registerAgentMcp() {
|
||||
const { command, args } = getMcpCommand();
|
||||
const results = await providerMcpService.addMcpServerToAllProviders({
|
||||
name: MCP_SERVER_NAME,
|
||||
scope: 'user',
|
||||
transport: 'stdio',
|
||||
command,
|
||||
args,
|
||||
env: {
|
||||
CLOUDCLI_COMPUTER_USE_MCP_TOKEN: getOrCreateMcpToken(),
|
||||
CLOUDCLI_COMPUTER_USE_API_URL: getMcpApiUrl(),
|
||||
},
|
||||
});
|
||||
return { name: MCP_SERVER_NAME, command, args, results };
|
||||
},
|
||||
|
||||
getMcpToken() {
|
||||
return getOrCreateMcpToken();
|
||||
},
|
||||
|
||||
async unregisterAgentMcp() {
|
||||
const results = await Promise.all(MCP_PROVIDERS.map(async (provider) => {
|
||||
try {
|
||||
const result = await providerMcpService.removeProviderMcpServer(provider, {
|
||||
name: MCP_SERVER_NAME,
|
||||
scope: 'user',
|
||||
});
|
||||
return { provider, removed: result.removed };
|
||||
} catch (error) {
|
||||
return {
|
||||
provider,
|
||||
removed: false,
|
||||
error: error instanceof Error ? error.message : 'Unknown error',
|
||||
};
|
||||
}
|
||||
}));
|
||||
return { name: MCP_SERVER_NAME, results };
|
||||
},
|
||||
|
||||
async installRuntime() {
|
||||
const result = await installRuntime();
|
||||
return {
|
||||
...result,
|
||||
status: await this.getStatus(),
|
||||
};
|
||||
},
|
||||
|
||||
async listSessions(owner: ComputerUseOwner) {
|
||||
const ownerId = getOwnerId(owner);
|
||||
await expireStaleSessions();
|
||||
return [...sessions.values()]
|
||||
.filter((session) => canAccessSession(ownerId, session))
|
||||
.map(publicSession);
|
||||
},
|
||||
|
||||
async createSession(owner: ComputerUseOwner, options?: { createdBy?: 'user' | 'agent' }) {
|
||||
const ownerId = getOwnerId(owner);
|
||||
await expireStaleSessions();
|
||||
const createdBy = options?.createdBy ?? 'user';
|
||||
|
||||
const now = new Date().toISOString();
|
||||
const session: ComputerUseSession = {
|
||||
id: randomUUID(),
|
||||
ownerId,
|
||||
createdBy,
|
||||
runtime: getRuntime(),
|
||||
status: 'unavailable',
|
||||
screenshotDataUrl: null,
|
||||
createdAt: now,
|
||||
updatedAt: now,
|
||||
lastAction: 'create',
|
||||
// Consent is always OFF at creation — the user must explicitly grant control,
|
||||
// even for agent-initiated sessions controlling the full desktop.
|
||||
agentAccessEnabled: false,
|
||||
displaySize: null,
|
||||
message: null,
|
||||
cursor: null,
|
||||
};
|
||||
|
||||
const activeOwnerSessions = ownerSessions(ownerId).filter((item) => item.status === 'ready');
|
||||
if (activeOwnerSessions.length >= MAX_SESSIONS_PER_OWNER) {
|
||||
throw new Error(`Computer Use is limited to ${MAX_SESSIONS_PER_OWNER} active session(s).`);
|
||||
}
|
||||
|
||||
const settings = readSettings();
|
||||
const readiness = getRuntimeReadiness();
|
||||
const isCloud = getRuntime() === 'cloud';
|
||||
const runtimeReady = readiness.nutInstalled && readiness.screenshotInstalled;
|
||||
const ready = settings.enabled && (isCloud
|
||||
? desktopAgentRelay.isConnected()
|
||||
: runtimeReady);
|
||||
|
||||
if (!ready) {
|
||||
session.message = getSetupMessage(settings, readiness);
|
||||
sessions.set(session.id, session);
|
||||
return publicSession(session);
|
||||
}
|
||||
|
||||
// In cloud mode the linked desktop agent is the consent authority and prompts
|
||||
// the user per its own consent mode, so the relay is allowed to act. In local
|
||||
// mode the user must still grant control from the panel.
|
||||
if (isCloud) {
|
||||
session.agentAccessEnabled = true;
|
||||
}
|
||||
|
||||
session.status = 'ready';
|
||||
session.message = isCloud
|
||||
? 'Computer Use session is ready on the linked desktop.'
|
||||
: 'Computer Use session is ready. Grant control to let agents act.';
|
||||
sessions.set(session.id, session);
|
||||
try {
|
||||
await refreshScreenshot(session);
|
||||
} catch (error) {
|
||||
session.status = 'unavailable';
|
||||
session.message = error instanceof Error ? error.message : 'Failed to capture the screen.';
|
||||
}
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async grantAgentAccess(owner: ComputerUseOwner, sessionId: string) {
|
||||
const ownerId = getOwnerId(owner);
|
||||
const session = sessions.get(sessionId);
|
||||
if (!session || !canAccessSession(ownerId, session)) {
|
||||
throw new Error('Computer Use session not found.');
|
||||
}
|
||||
session.agentAccessEnabled = true;
|
||||
session.updatedAt = new Date().toISOString();
|
||||
session.lastAction = 'consent:grant';
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async revokeAgentAccess(owner: ComputerUseOwner, sessionId: string) {
|
||||
const ownerId = getOwnerId(owner);
|
||||
const session = sessions.get(sessionId);
|
||||
if (!session || !canAccessSession(ownerId, session)) {
|
||||
throw new Error('Computer Use session not found.');
|
||||
}
|
||||
session.agentAccessEnabled = false;
|
||||
session.updatedAt = new Date().toISOString();
|
||||
session.lastAction = 'consent:revoke';
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async stopSession(owner: ComputerUseOwner, sessionId: string) {
|
||||
const ownerId = getOwnerId(owner);
|
||||
const session = sessions.get(sessionId);
|
||||
if (!session || !canAccessSession(ownerId, session)) {
|
||||
return { stopped: false };
|
||||
}
|
||||
|
||||
session.status = 'stopped';
|
||||
session.agentAccessEnabled = false;
|
||||
session.updatedAt = new Date().toISOString();
|
||||
session.lastAction = 'stop';
|
||||
session.message = 'Computer Use session stopped. Agent control is revoked.';
|
||||
if (getRuntime() === 'cloud' && desktopAgentRelay.isConnected()) {
|
||||
// Best-effort: tell the desktop agent to forget this session's consent.
|
||||
void desktopAgentRelay.relay('stop_session', { sessionId }).catch(() => undefined);
|
||||
}
|
||||
return { stopped: true, session: publicSession(session) };
|
||||
},
|
||||
|
||||
async deleteSession(owner: ComputerUseOwner, sessionId: string) {
|
||||
const ownerId = getOwnerId(owner);
|
||||
const session = sessions.get(sessionId);
|
||||
if (!session || !canAccessSession(ownerId, session)) {
|
||||
return { deleted: false };
|
||||
}
|
||||
|
||||
sessions.delete(sessionId);
|
||||
return { deleted: true, sessionId };
|
||||
},
|
||||
|
||||
// --- User-initiated actions (from the panel) -------------------------------
|
||||
|
||||
async userScreenshot(owner: ComputerUseOwner, sessionId: string) {
|
||||
const ownerId = getOwnerId(owner);
|
||||
const session = sessions.get(sessionId);
|
||||
if (!session || !canAccessSession(ownerId, session)) {
|
||||
throw new Error('Computer Use session not found.');
|
||||
}
|
||||
assertReady(session);
|
||||
await refreshScreenshot(session);
|
||||
session.lastAction = 'screenshot';
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async userClick(owner: ComputerUseOwner, sessionId: string, input: { x: number; y: number; button?: ClickButton; double?: boolean }) {
|
||||
const ownerId = getOwnerId(owner);
|
||||
const session = sessions.get(sessionId);
|
||||
if (!session || !canAccessSession(ownerId, session)) {
|
||||
throw new Error('Computer Use session not found.');
|
||||
}
|
||||
assertReady(session);
|
||||
await performAction(session, {
|
||||
type: 'click',
|
||||
button: input.button || 'left',
|
||||
point: { x: input.x, y: input.y },
|
||||
double: input.double === true,
|
||||
});
|
||||
session.cursor = { x: input.x, y: input.y, actor: 'user' };
|
||||
session.lastAction = input.double ? 'double_click' : 'click';
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async userPressKey(owner: ComputerUseOwner, sessionId: string, key: string) {
|
||||
const ownerId = getOwnerId(owner);
|
||||
const session = sessions.get(sessionId);
|
||||
if (!session || !canAccessSession(ownerId, session)) {
|
||||
throw new Error('Computer Use session not found.');
|
||||
}
|
||||
assertReady(session);
|
||||
await performAction(session, { type: 'key', key });
|
||||
session.lastAction = `key:${key}`;
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
// --- Agent-initiated actions (via MCP) ------------------------------------
|
||||
|
||||
/**
|
||||
* Resolves a session the agent is allowed to act on. In local mode this
|
||||
* enforces the in-process per-session consent flag. In cloud mode the linked
|
||||
* desktop agent is the consent authority (it prompts the user per its own
|
||||
* consent mode), so this only requires the relay to be connected.
|
||||
*/
|
||||
async getOrCreateAgentSession(): Promise<ComputerUseSession> {
|
||||
assertAgentToolsAvailable();
|
||||
await expireStaleSessions();
|
||||
const existing = findActiveAgentSession();
|
||||
if (existing) {
|
||||
return existing;
|
||||
}
|
||||
|
||||
const created = await this.createSession({ id: AGENT_OWNER_ID }, { createdBy: 'agent' });
|
||||
const session = sessions.get(created.id);
|
||||
if (!session) {
|
||||
throw new Error('Computer Use session could not be created.');
|
||||
}
|
||||
return session;
|
||||
},
|
||||
|
||||
async getConsentedSession(sessionId?: string): Promise<ComputerUseSession> {
|
||||
assertAgentToolsAvailable();
|
||||
const normalizedSessionId = normalizeSessionId(sessionId);
|
||||
const session = normalizedSessionId
|
||||
? sessions.get(normalizedSessionId)
|
||||
: await this.getOrCreateAgentSession();
|
||||
if (!session) {
|
||||
throw new Error('Computer Use session not found.');
|
||||
}
|
||||
if (getRuntime() !== 'cloud' && !session.agentAccessEnabled) {
|
||||
throw new Error(`Computer Use session ${session.id} is awaiting user consent. Ask the user to grant control in the Computer panel.`);
|
||||
}
|
||||
assertReady(session);
|
||||
return session;
|
||||
},
|
||||
|
||||
async agentScreenshot(sessionId?: string) {
|
||||
const session = await this.getConsentedSession(sessionId);
|
||||
await refreshScreenshot(session);
|
||||
session.lastAction = 'screenshot';
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async agentCursorPosition(sessionId?: string) {
|
||||
const session = await this.getConsentedSession(sessionId);
|
||||
const point = await getCursorPosition(session);
|
||||
session.cursor = { ...point, actor: 'agent' };
|
||||
session.lastAction = 'cursor_position';
|
||||
return { session: publicSession(session), position: point };
|
||||
},
|
||||
|
||||
async agentMouseMove(sessionId: string | undefined, point: Point) {
|
||||
const session = await this.getConsentedSession(sessionId);
|
||||
await performAction(session, { type: 'mouse_move', point });
|
||||
session.cursor = { ...point, actor: 'agent' };
|
||||
session.lastAction = 'mouse_move';
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async agentUnifiedClick(sessionId: string | undefined, input: { button?: ClickButton; point?: Point; clickCount?: number }) {
|
||||
const session = await this.getConsentedSession(sessionId);
|
||||
const button = input.button || 'left';
|
||||
const clickCount = Math.max(1, Math.min(Math.trunc(input.clickCount || 1), 5));
|
||||
for (let index = 0; index < clickCount; index += 1) {
|
||||
await performAction(session, { type: 'click', button, point: input.point, double: false });
|
||||
}
|
||||
if (input.point) {
|
||||
session.cursor = { ...input.point, actor: 'agent' };
|
||||
}
|
||||
session.lastAction = clickCount > 1 ? `${button}_click:${clickCount}` : `${button}_click`;
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async agentDrag(sessionId: string | undefined, from: Point, to: Point, button: ClickButton = 'left') {
|
||||
const session = await this.getConsentedSession(sessionId);
|
||||
await performAction(session, { type: 'drag', from, to, button });
|
||||
session.cursor = { ...to, actor: 'agent' };
|
||||
session.lastAction = `${button}_drag`;
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async agentType(sessionId: string | undefined, text: string) {
|
||||
const session = await this.getConsentedSession(sessionId);
|
||||
await performAction(session, { type: 'type', text });
|
||||
session.lastAction = 'type';
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async agentKey(sessionId: string | undefined, key: string) {
|
||||
const session = await this.getConsentedSession(sessionId);
|
||||
await performAction(session, { type: 'key', key });
|
||||
session.lastAction = `key:${key}`;
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async agentScroll(sessionId: string | undefined, input: { direction: ScrollDirection; amount?: number; x?: number; y?: number }) {
|
||||
const session = await this.getConsentedSession(sessionId);
|
||||
const point = typeof input.x === 'number' && typeof input.y === 'number' ? { x: input.x, y: input.y } : undefined;
|
||||
await performAction(session, { type: 'scroll', direction: input.direction, amount: input.amount, point });
|
||||
if (point) {
|
||||
session.cursor = { ...point, actor: 'agent' };
|
||||
}
|
||||
session.lastAction = `scroll:${input.direction}`;
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async agentWait(sessionId?: string, timeoutMs?: number) {
|
||||
const session = await this.getConsentedSession(sessionId);
|
||||
await performAction(session, { type: 'wait', ms: timeoutMs });
|
||||
session.lastAction = 'wait';
|
||||
return publicSession(session);
|
||||
},
|
||||
|
||||
async agentStopSession(sessionId?: string) {
|
||||
assertAgentToolsAvailable();
|
||||
const normalizedSessionId = normalizeSessionId(sessionId);
|
||||
if (normalizedSessionId) {
|
||||
return this.stopSession({ id: AGENT_OWNER_ID }, normalizedSessionId);
|
||||
}
|
||||
|
||||
await expireStaleSessions();
|
||||
const existing = findActiveAgentSession();
|
||||
if (!existing) {
|
||||
return { stopped: false };
|
||||
}
|
||||
return this.stopSession({ id: AGENT_OWNER_ID }, existing.id);
|
||||
},
|
||||
|
||||
async callSemanticTool(toolName: string, args: Record<string, unknown>) {
|
||||
if (!semanticOperationNames.has(toolName)) {
|
||||
throw new Error(`Unsupported semantic Computer Use tool: ${toolName}`);
|
||||
}
|
||||
|
||||
const sessionId = typeof args.sessionId === 'string' ? args.sessionId : undefined;
|
||||
const session = await this.getConsentedSession(normalizeSessionId(sessionId) ?? undefined);
|
||||
const toolArgs = { ...stripSessionArgs(args), sessionId: session.id };
|
||||
const semanticResult = getRuntime() === 'cloud'
|
||||
? await desktopAgentRelay.relay('semantic_tool', {
|
||||
sessionId: session.id,
|
||||
displaySize: session.displaySize,
|
||||
toolName,
|
||||
arguments: toolArgs,
|
||||
})
|
||||
: await computerSemanticsService.callTool(toolName, toolArgs);
|
||||
|
||||
applyRelayResult(session, semanticResult as RelayResult);
|
||||
session.lastAction = `semantic:${toolName}`;
|
||||
return { session: publicSession(session), result: semanticResult };
|
||||
},
|
||||
|
||||
/**
|
||||
* Cloud only: when a desktop agent links to this hosted environment, expose
|
||||
* the computer_* MCP tools only if the user enabled Computer Use in settings.
|
||||
*/
|
||||
async onDesktopAgentConnected() {
|
||||
if (getRuntime() !== 'cloud') {
|
||||
return;
|
||||
}
|
||||
if (!readSettings().enabled) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
await this.registerAgentMcp();
|
||||
} catch (error) {
|
||||
console.warn('[Computer Use] Failed to register MCP for linked desktop agent:', error instanceof Error ? error.message : error);
|
||||
}
|
||||
},
|
||||
|
||||
/** Cloud only: tear down sessions when the last desktop agent disconnects. */
|
||||
async onDesktopAgentDisconnected() {
|
||||
if (getRuntime() !== 'cloud' || desktopAgentRelay.isConnected()) {
|
||||
return;
|
||||
}
|
||||
for (const session of sessions.values()) {
|
||||
if (session.status === 'ready') {
|
||||
session.status = 'stopped';
|
||||
session.agentAccessEnabled = false;
|
||||
session.updatedAt = new Date().toISOString();
|
||||
session.lastAction = 'agent-disconnected';
|
||||
session.message = 'The linked desktop agent disconnected.';
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
async stopAllSessions() {
|
||||
stopSessions('shutdown', 'Computer Use session stopped during server shutdown.');
|
||||
},
|
||||
};
|
||||
|
||||
// Drive cloud MCP exposure + session teardown off desktop-agent connectivity.
|
||||
desktopAgentRelay.setHooks({
|
||||
canAcceptConnection: () => getRuntime() === 'cloud' && readSettings().enabled,
|
||||
onFirstConnect: () => computerUseService.onDesktopAgentConnected(),
|
||||
onLastDisconnect: () => computerUseService.onDesktopAgentDisconnected(),
|
||||
});
|
||||
|
||||
process.once('beforeExit', () => {
|
||||
void computerUseService.stopAllSessions();
|
||||
});
|
||||
158
server/modules/computer-use/desktop-agent-relay.service.ts
Normal file
158
server/modules/computer-use/desktop-agent-relay.service.ts
Normal file
@@ -0,0 +1,158 @@
|
||||
import { randomUUID } from 'node:crypto';
|
||||
|
||||
import type { WebSocket } from 'ws';
|
||||
|
||||
const RELAY_TIMEOUT_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_RELAY_TIMEOUT_MS || '60000', 10);
|
||||
const WS_OPEN = 1;
|
||||
|
||||
type PendingRelay = {
|
||||
resolve: (value: unknown) => void;
|
||||
reject: (reason: Error) => void;
|
||||
timer: ReturnType<typeof setTimeout>;
|
||||
};
|
||||
|
||||
type ConnectedAgent = {
|
||||
ws: WebSocket;
|
||||
label: string;
|
||||
registeredAt: string;
|
||||
};
|
||||
|
||||
type RelayLifecycleHooks = {
|
||||
canAcceptConnection?: () => boolean;
|
||||
onFirstConnect?: () => void | Promise<void>;
|
||||
onLastDisconnect?: () => void | Promise<void>;
|
||||
};
|
||||
|
||||
const agents = new Map<WebSocket, ConnectedAgent>();
|
||||
const pending = new Map<string, PendingRelay>();
|
||||
let hooks: RelayLifecycleHooks = {};
|
||||
|
||||
function rejectAllPending(reason: string): void {
|
||||
for (const [callId, call] of pending.entries()) {
|
||||
clearTimeout(call.timer);
|
||||
call.reject(new Error(reason));
|
||||
pending.delete(callId);
|
||||
}
|
||||
}
|
||||
|
||||
function pickAgent(): ConnectedAgent | undefined {
|
||||
for (const agent of agents.values()) {
|
||||
if (agent.ws.readyState === WS_OPEN) {
|
||||
return agent;
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cloud-side registry of linked desktop agents and the request/response relay
|
||||
* used to drive the user's real desktop. The hosted server never touches the OS
|
||||
* itself — it only forwards `computer_*` actions to a connected desktop agent
|
||||
* and awaits the screenshot it returns.
|
||||
*/
|
||||
export const desktopAgentRelay = {
|
||||
setHooks(next: RelayLifecycleHooks): void {
|
||||
hooks = next;
|
||||
},
|
||||
|
||||
register(ws: WebSocket, label = 'desktop-agent'): boolean {
|
||||
if (hooks.canAcceptConnection && !hooks.canAcceptConnection()) {
|
||||
console.log(`[DesktopAgent] Rejected (${label}); Computer Use is disabled.`);
|
||||
try {
|
||||
ws.close(1008, 'Computer Use is disabled in this environment.');
|
||||
} catch {
|
||||
// ignore close failures
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
const wasEmpty = pickAgent() === undefined;
|
||||
agents.set(ws, { ws, label, registeredAt: new Date().toISOString() });
|
||||
console.log(`[DesktopAgent] Registered (${label}); ${agents.size} connected.`);
|
||||
|
||||
ws.on('close', () => {
|
||||
const wasRegistered = agents.delete(ws);
|
||||
console.log(`[DesktopAgent] Disconnected (${label}); ${agents.size} remain.`);
|
||||
if (wasRegistered && pickAgent() === undefined) {
|
||||
rejectAllPending('Desktop agent disconnected.');
|
||||
void hooks.onLastDisconnect?.();
|
||||
}
|
||||
});
|
||||
|
||||
if (wasEmpty) {
|
||||
void hooks.onFirstConnect?.();
|
||||
}
|
||||
return true;
|
||||
},
|
||||
|
||||
disconnectAll(reason = 'Desktop agent disconnected.'): void {
|
||||
const hadAgent = pickAgent() !== undefined;
|
||||
const sockets = [...agents.keys()];
|
||||
agents.clear();
|
||||
for (const ws of sockets) {
|
||||
try {
|
||||
ws.close(1008, reason);
|
||||
} catch {
|
||||
// ignore close failures
|
||||
}
|
||||
}
|
||||
rejectAllPending(reason);
|
||||
if (hadAgent) {
|
||||
void hooks.onLastDisconnect?.();
|
||||
}
|
||||
},
|
||||
|
||||
/** Resolves a pending relay call with the desktop agent's reply. */
|
||||
handleResult(id: string, result: unknown, error?: string): void {
|
||||
const call = pending.get(id);
|
||||
if (!call) {
|
||||
return;
|
||||
}
|
||||
clearTimeout(call.timer);
|
||||
pending.delete(id);
|
||||
if (error) {
|
||||
call.reject(new Error(error));
|
||||
} else {
|
||||
call.resolve(result);
|
||||
}
|
||||
},
|
||||
|
||||
isConnected(): boolean {
|
||||
return pickAgent() !== undefined;
|
||||
},
|
||||
|
||||
connectedCount(): number {
|
||||
let count = 0;
|
||||
for (const agent of agents.values()) {
|
||||
if (agent.ws.readyState === WS_OPEN) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
},
|
||||
|
||||
async relay(type: string, params: Record<string, unknown>): Promise<unknown> {
|
||||
const agent = pickAgent();
|
||||
if (!agent) {
|
||||
throw new Error(
|
||||
'No desktop is linked. Open CloudCLI Desktop on this computer, connect the same account, and enable Computer Use.'
|
||||
);
|
||||
}
|
||||
|
||||
const id = randomUUID();
|
||||
return new Promise<unknown>((resolve, reject) => {
|
||||
const timer = setTimeout(() => {
|
||||
pending.delete(id);
|
||||
reject(new Error('Desktop agent did not respond in time.'));
|
||||
}, RELAY_TIMEOUT_MS);
|
||||
pending.set(id, { resolve, reject, timer });
|
||||
try {
|
||||
agent.ws.send(JSON.stringify({ kind: 'computer_relay', id, type, params }));
|
||||
} catch (error) {
|
||||
clearTimeout(timer);
|
||||
pending.delete(id);
|
||||
reject(error instanceof Error ? error : new Error('Failed to send to desktop agent.'));
|
||||
}
|
||||
});
|
||||
},
|
||||
};
|
||||
2
server/modules/computer-use/index.ts
Normal file
2
server/modules/computer-use/index.ts
Normal file
@@ -0,0 +1,2 @@
|
||||
export { computerUseService } from '@/modules/computer-use/computer-use.service.js';
|
||||
export { desktopAgentRelay } from '@/modules/computer-use/desktop-agent-relay.service.js';
|
||||
@@ -0,0 +1,82 @@
|
||||
import { SemanticHelperProcess } from '@/modules/computer-use/semantics/helpers/semantic-helper-process.js';
|
||||
import { resolveSemanticHelper } from '@/modules/computer-use/semantics/helpers/semantic-helper-resolver.js';
|
||||
import type { SemanticAdapter, SemanticAdapterCapabilities } from '@/modules/computer-use/semantics/adapters/semantic-adapter.js';
|
||||
import type { SemanticApp, SemanticAppState, SemanticToolInput } from '@/modules/computer-use/semantics/semantic-types.js';
|
||||
|
||||
type HelperMethod =
|
||||
| 'list_apps'
|
||||
| 'get_app_state'
|
||||
| 'click_element'
|
||||
| 'perform_secondary_action'
|
||||
| 'set_value'
|
||||
| 'type_text'
|
||||
| 'press_key'
|
||||
| 'scroll_element'
|
||||
| 'drag';
|
||||
|
||||
export class HelperSemanticAdapter implements SemanticAdapter {
|
||||
private helper: SemanticHelperProcess | null = null;
|
||||
|
||||
constructor(
|
||||
private readonly platform: NodeJS.Platform,
|
||||
private readonly arch: NodeJS.Architecture = process.arch,
|
||||
) {}
|
||||
|
||||
capabilities(): SemanticAdapterCapabilities {
|
||||
return {
|
||||
platform: this.platform,
|
||||
appDiscovery: true,
|
||||
accessibilityTree: true,
|
||||
nativeElementActions: true,
|
||||
nativeValueSetting: true,
|
||||
targetedInput: true,
|
||||
};
|
||||
}
|
||||
|
||||
async listApps(): Promise<SemanticApp[]> {
|
||||
return await this.request('list_apps', {}) as SemanticApp[];
|
||||
}
|
||||
|
||||
async getAppState(input: SemanticToolInput): Promise<SemanticAppState> {
|
||||
return await this.request('get_app_state', input) as SemanticAppState;
|
||||
}
|
||||
|
||||
async clickElement(input: SemanticToolInput): Promise<SemanticAppState> {
|
||||
return await this.request('click_element', input) as SemanticAppState;
|
||||
}
|
||||
|
||||
async performSecondaryAction(input: SemanticToolInput): Promise<SemanticAppState> {
|
||||
return await this.request('perform_secondary_action', input) as SemanticAppState;
|
||||
}
|
||||
|
||||
async setValue(input: SemanticToolInput): Promise<SemanticAppState> {
|
||||
return await this.request('set_value', input) as SemanticAppState;
|
||||
}
|
||||
|
||||
async typeText(input: SemanticToolInput): Promise<SemanticAppState> {
|
||||
return await this.request('type_text', input) as SemanticAppState;
|
||||
}
|
||||
|
||||
async pressKey(input: SemanticToolInput): Promise<SemanticAppState> {
|
||||
return await this.request('press_key', input) as SemanticAppState;
|
||||
}
|
||||
|
||||
async scrollElement(input: SemanticToolInput): Promise<SemanticAppState> {
|
||||
return await this.request('scroll_element', input) as SemanticAppState;
|
||||
}
|
||||
|
||||
async drag(input: SemanticToolInput): Promise<SemanticAppState> {
|
||||
return await this.request('drag', input) as SemanticAppState;
|
||||
}
|
||||
|
||||
private async request(method: HelperMethod, params: Record<string, unknown>): Promise<unknown> {
|
||||
if (!this.helper) {
|
||||
const resolution = resolveSemanticHelper(this.platform, this.arch);
|
||||
if (!resolution.available || !resolution.path) {
|
||||
throw new Error(resolution.reason || `Semantic helper is unavailable for ${this.platform}-${this.arch}.`);
|
||||
}
|
||||
this.helper = new SemanticHelperProcess(resolution.path);
|
||||
}
|
||||
return this.helper.request(method, params);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
import { HelperSemanticAdapter } from '@/modules/computer-use/semantics/adapters/helper-semantic-adapter.js';
|
||||
|
||||
export function createMacOsSemanticAdapter(): HelperSemanticAdapter {
|
||||
return new HelperSemanticAdapter('darwin');
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
import type { SemanticApp, SemanticAppState, SemanticToolInput } from '@/modules/computer-use/semantics/semantic-types.js';
|
||||
|
||||
export type SemanticAdapterCapabilities = {
|
||||
platform: NodeJS.Platform;
|
||||
appDiscovery: boolean;
|
||||
accessibilityTree: boolean;
|
||||
nativeElementActions: boolean;
|
||||
nativeValueSetting: boolean;
|
||||
targetedInput: boolean;
|
||||
};
|
||||
|
||||
export type SemanticAdapter = {
|
||||
capabilities(): SemanticAdapterCapabilities;
|
||||
listApps(): Promise<SemanticApp[]>;
|
||||
getAppState(input: SemanticToolInput): Promise<SemanticAppState>;
|
||||
clickElement(input: SemanticToolInput): Promise<SemanticAppState>;
|
||||
performSecondaryAction(input: SemanticToolInput): Promise<SemanticAppState>;
|
||||
setValue(input: SemanticToolInput): Promise<SemanticAppState>;
|
||||
typeText(input: SemanticToolInput): Promise<SemanticAppState>;
|
||||
pressKey(input: SemanticToolInput): Promise<SemanticAppState>;
|
||||
scrollElement(input: SemanticToolInput): Promise<SemanticAppState>;
|
||||
drag(input: SemanticToolInput): Promise<SemanticAppState>;
|
||||
};
|
||||
@@ -0,0 +1,5 @@
|
||||
import { HelperSemanticAdapter } from '@/modules/computer-use/semantics/adapters/helper-semantic-adapter.js';
|
||||
|
||||
export function createWindowsSemanticAdapter(): HelperSemanticAdapter {
|
||||
return new HelperSemanticAdapter('win32');
|
||||
}
|
||||
@@ -0,0 +1,467 @@
|
||||
import AppKit
|
||||
import ApplicationServices
|
||||
import Foundation
|
||||
|
||||
typealias JSON = [String: Any]
|
||||
|
||||
struct ElementRecord {
|
||||
let index: String
|
||||
let role: String
|
||||
let title: String?
|
||||
let value: String?
|
||||
let bounds: [String: Double]?
|
||||
let actions: [String]
|
||||
}
|
||||
|
||||
var stateElements: [String: [ElementRecord]] = [:]
|
||||
var stateAxElements: [String: [String: AXUIElement]] = [:]
|
||||
var stateOrder: [String] = []
|
||||
let maxStoredStates = 100
|
||||
|
||||
func jsonLine(_ object: Any) {
|
||||
guard JSONSerialization.isValidJSONObject(object),
|
||||
let data = try? JSONSerialization.data(withJSONObject: object),
|
||||
let text = String(data: data, encoding: .utf8)
|
||||
else {
|
||||
print("{\"error\":\"Failed to encode JSON\"}")
|
||||
fflush(stdout)
|
||||
return
|
||||
}
|
||||
print(text)
|
||||
fflush(stdout)
|
||||
}
|
||||
|
||||
func respond(id: Any?, result: Any) {
|
||||
jsonLine(["id": id ?? NSNull(), "result": result])
|
||||
}
|
||||
|
||||
func respondError(id: Any?, _ message: String) {
|
||||
jsonLine(["id": id ?? NSNull(), "error": message])
|
||||
}
|
||||
|
||||
func stringAttr(_ element: AXUIElement, _ attr: CFString) -> String? {
|
||||
var value: CFTypeRef?
|
||||
guard AXUIElementCopyAttributeValue(element, attr, &value) == .success else { return nil }
|
||||
return value as? String
|
||||
}
|
||||
|
||||
func boolAttr(_ element: AXUIElement, _ attr: CFString) -> Bool? {
|
||||
var value: CFTypeRef?
|
||||
guard AXUIElementCopyAttributeValue(element, attr, &value) == .success else { return nil }
|
||||
return value as? Bool
|
||||
}
|
||||
|
||||
func arrayAttr(_ element: AXUIElement, _ attr: CFString) -> [AXUIElement] {
|
||||
var value: CFTypeRef?
|
||||
guard AXUIElementCopyAttributeValue(element, attr, &value) == .success else { return [] }
|
||||
return value as? [AXUIElement] ?? []
|
||||
}
|
||||
|
||||
func actions(_ element: AXUIElement) -> [String] {
|
||||
var names: CFArray?
|
||||
guard AXUIElementCopyActionNames(element, &names) == .success else { return [] }
|
||||
return names as? [String] ?? []
|
||||
}
|
||||
|
||||
func bounds(_ element: AXUIElement) -> [String: Double]? {
|
||||
var positionRef: CFTypeRef?
|
||||
var sizeRef: CFTypeRef?
|
||||
guard AXUIElementCopyAttributeValue(element, kAXPositionAttribute as CFString, &positionRef) == .success,
|
||||
AXUIElementCopyAttributeValue(element, kAXSizeAttribute as CFString, &sizeRef) == .success,
|
||||
let positionValue = positionRef,
|
||||
let sizeValue = sizeRef
|
||||
else { return nil }
|
||||
|
||||
var point = CGPoint.zero
|
||||
var size = CGSize.zero
|
||||
guard CFGetTypeID(positionValue) == AXValueGetTypeID(),
|
||||
CFGetTypeID(sizeValue) == AXValueGetTypeID()
|
||||
else { return nil }
|
||||
|
||||
let positionAxValue = positionValue as! AXValue
|
||||
let sizeAxValue = sizeValue as! AXValue
|
||||
guard AXValueGetValue(positionAxValue, .cgPoint, &point),
|
||||
AXValueGetValue(sizeAxValue, .cgSize, &size)
|
||||
else { return nil }
|
||||
|
||||
return [
|
||||
"x": Double(point.x),
|
||||
"y": Double(point.y),
|
||||
"width": Double(size.width),
|
||||
"height": Double(size.height),
|
||||
]
|
||||
}
|
||||
|
||||
func record(_ element: AXUIElement, index: String) -> ElementRecord {
|
||||
ElementRecord(
|
||||
index: index,
|
||||
role: stringAttr(element, kAXRoleAttribute as CFString) ?? "AXUnknown",
|
||||
title: stringAttr(element, kAXTitleAttribute as CFString) ?? stringAttr(element, kAXDescriptionAttribute as CFString),
|
||||
value: stringAttr(element, kAXValueAttribute as CFString),
|
||||
bounds: bounds(element),
|
||||
actions: actions(element)
|
||||
)
|
||||
}
|
||||
|
||||
func cachedElement(_ params: JSON) -> AXUIElement? {
|
||||
guard let stateId = params["stateId"] as? String,
|
||||
let elementIndex = params["element_index"] as? String
|
||||
else {
|
||||
return nil
|
||||
}
|
||||
return stateAxElements[stateId]?[elementIndex]
|
||||
}
|
||||
|
||||
func dictionary(_ record: ElementRecord) -> JSON {
|
||||
var output: JSON = [
|
||||
"index": record.index,
|
||||
"role": record.role,
|
||||
"actions": record.actions,
|
||||
]
|
||||
if let title = record.title { output["title"] = title }
|
||||
if let value = record.value { output["value"] = value }
|
||||
if let bounds = record.bounds { output["bounds"] = bounds }
|
||||
return output
|
||||
}
|
||||
|
||||
func pruneStoredStates() {
|
||||
while stateOrder.count > maxStoredStates {
|
||||
let evicted = stateOrder.removeFirst()
|
||||
stateElements.removeValue(forKey: evicted)
|
||||
stateAxElements.removeValue(forKey: evicted)
|
||||
}
|
||||
}
|
||||
|
||||
func resolveApp(_ query: String) throws -> NSRunningApplication {
|
||||
let normalized = query.lowercased()
|
||||
let apps = NSWorkspace.shared.runningApplications.filter { app in
|
||||
app.activationPolicy == .regular
|
||||
}
|
||||
if let app = apps.first(where: { $0.bundleIdentifier?.lowercased() == normalized }) {
|
||||
return app
|
||||
}
|
||||
if let app = apps.first(where: { ($0.localizedName ?? "").lowercased() == normalized }) {
|
||||
return app
|
||||
}
|
||||
if let app = apps.first(where: { ($0.localizedName ?? "").lowercased().contains(normalized) }) {
|
||||
return app
|
||||
}
|
||||
throw NSError(domain: "CloudCLISemantics", code: 404, userInfo: [NSLocalizedDescriptionKey: "App is not running: \(query)"])
|
||||
}
|
||||
|
||||
func listApps() -> [[String: Any]] {
|
||||
NSWorkspace.shared.runningApplications
|
||||
.filter { $0.activationPolicy == .regular }
|
||||
.map { app in
|
||||
[
|
||||
"id": app.bundleIdentifier ?? app.localizedName ?? "\(app.processIdentifier)",
|
||||
"name": app.localizedName ?? app.bundleIdentifier ?? "Unknown",
|
||||
"bundleIdentifier": app.bundleIdentifier ?? "",
|
||||
"pid": Int(app.processIdentifier),
|
||||
"running": true,
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
func walk(_ element: AXUIElement, depth: Int, maxDepth: Int, records: inout [ElementRecord], axRecords: inout [String: AXUIElement], limit: Int) {
|
||||
if depth > maxDepth || records.count >= limit { return }
|
||||
let index = "\(records.count + 1)"
|
||||
records.append(record(element, index: index))
|
||||
axRecords[index] = element
|
||||
for child in arrayAttr(element, kAXChildrenAttribute as CFString) {
|
||||
walk(child, depth: depth + 1, maxDepth: maxDepth, records: &records, axRecords: &axRecords, limit: limit)
|
||||
if records.count >= limit { return }
|
||||
}
|
||||
}
|
||||
|
||||
func pngDataUrlForMainDisplay() -> String? {
|
||||
let fileURL = URL(fileURLWithPath: NSTemporaryDirectory()).appendingPathComponent("cloudcli-semantics-\(UUID().uuidString).png")
|
||||
let process = Process()
|
||||
process.executableURL = URL(fileURLWithPath: "/usr/sbin/screencapture")
|
||||
process.arguments = ["-x", "-t", "png", fileURL.path]
|
||||
|
||||
do {
|
||||
try process.run()
|
||||
process.waitUntilExit()
|
||||
guard process.terminationStatus == 0 else { return nil }
|
||||
let png = try Data(contentsOf: fileURL)
|
||||
try? FileManager.default.removeItem(at: fileURL)
|
||||
return png.isEmpty ? nil : "data:image/png;base64,\(png.base64EncodedString())"
|
||||
} catch {
|
||||
try? FileManager.default.removeItem(at: fileURL)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func getAppState(_ params: JSON) throws -> JSON {
|
||||
let appName = params["app"] as? String ?? ""
|
||||
let app = try resolveApp(appName)
|
||||
let axApp = AXUIElementCreateApplication(app.processIdentifier)
|
||||
let windows = arrayAttr(axApp, kAXWindowsAttribute as CFString)
|
||||
let root = windows.first ?? axApp
|
||||
var records: [ElementRecord] = []
|
||||
var axRecords: [String: AXUIElement] = [:]
|
||||
walk(root, depth: 0, maxDepth: 5, records: &records, axRecords: &axRecords, limit: 300)
|
||||
let stateId = "state_\(UUID().uuidString)"
|
||||
stateElements[stateId] = records
|
||||
stateAxElements[stateId] = axRecords
|
||||
stateOrder.append(stateId)
|
||||
pruneStoredStates()
|
||||
|
||||
let elements = records.map(dictionary)
|
||||
return [
|
||||
"stateId": stateId,
|
||||
"app": app.localizedName ?? app.bundleIdentifier ?? appName,
|
||||
"platform": "darwin",
|
||||
"screenshotDataUrl": pngDataUrlForMainDisplay() ?? NSNull(),
|
||||
"displaySize": [
|
||||
"width": Int(CGDisplayPixelsWide(CGMainDisplayID())),
|
||||
"height": Int(CGDisplayPixelsHigh(CGMainDisplayID())),
|
||||
],
|
||||
"elements": elements,
|
||||
"accessibilityTree": elements,
|
||||
"treeText": elements.map { "\($0["index"] ?? "") \($0["role"] ?? "") \($0["title"] ?? "")" }.joined(separator: "\n"),
|
||||
]
|
||||
}
|
||||
|
||||
func cgMouseButton(_ value: Any?) -> CGMouseButton {
|
||||
guard let button = value as? String else { return .left }
|
||||
switch button {
|
||||
case "right": return .right
|
||||
case "middle": return .center
|
||||
default: return .left
|
||||
}
|
||||
}
|
||||
|
||||
func mouseEventTypes(_ button: CGMouseButton) -> (CGEventType, CGEventType) {
|
||||
switch button {
|
||||
case .right: return (.rightMouseDown, .rightMouseUp)
|
||||
case .center: return (.otherMouseDown, .otherMouseUp)
|
||||
default: return (.leftMouseDown, .leftMouseUp)
|
||||
}
|
||||
}
|
||||
|
||||
func postMouseClick(point: CGPoint, button: CGMouseButton, clickCount: Int = 1) throws {
|
||||
guard let source = CGEventSource(stateID: .combinedSessionState) else {
|
||||
throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create CGEventSource"])
|
||||
}
|
||||
let eventTypes = mouseEventTypes(button)
|
||||
for _ in 0..<max(1, clickCount) {
|
||||
let down = CGEvent(mouseEventSource: source, mouseType: eventTypes.0, mouseCursorPosition: point, mouseButton: button)
|
||||
let up = CGEvent(mouseEventSource: source, mouseType: eventTypes.1, mouseCursorPosition: point, mouseButton: button)
|
||||
down?.post(tap: .cghidEventTap)
|
||||
up?.post(tap: .cghidEventTap)
|
||||
usleep(80_000)
|
||||
}
|
||||
}
|
||||
|
||||
func postDrag(from: CGPoint, to: CGPoint, button: CGMouseButton) throws {
|
||||
guard let source = CGEventSource(stateID: .combinedSessionState) else {
|
||||
throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create CGEventSource"])
|
||||
}
|
||||
let eventTypes = mouseEventTypes(button)
|
||||
CGEvent(mouseEventSource: source, mouseType: eventTypes.0, mouseCursorPosition: from, mouseButton: button)?.post(tap: .cghidEventTap)
|
||||
usleep(80_000)
|
||||
CGEvent(mouseEventSource: source, mouseType: .leftMouseDragged, mouseCursorPosition: to, mouseButton: button)?.post(tap: .cghidEventTap)
|
||||
usleep(80_000)
|
||||
CGEvent(mouseEventSource: source, mouseType: eventTypes.1, mouseCursorPosition: to, mouseButton: button)?.post(tap: .cghidEventTap)
|
||||
}
|
||||
|
||||
func runAppleScript(_ script: String) throws {
|
||||
let process = Process()
|
||||
process.executableURL = URL(fileURLWithPath: "/usr/bin/osascript")
|
||||
process.arguments = ["-e", script]
|
||||
process.standardOutput = Pipe()
|
||||
let stderr = Pipe()
|
||||
process.standardError = stderr
|
||||
try process.run()
|
||||
process.waitUntilExit()
|
||||
if process.terminationStatus != 0 {
|
||||
let data = stderr.fileHandleForReading.readDataToEndOfFile()
|
||||
let message = String(data: data, encoding: .utf8) ?? "AppleScript failed."
|
||||
throw NSError(domain: "CloudCLISemantics", code: Int(process.terminationStatus), userInfo: [NSLocalizedDescriptionKey: message])
|
||||
}
|
||||
}
|
||||
|
||||
func escapedAppleScriptString(_ value: String) -> String {
|
||||
value.replacingOccurrences(of: "\\", with: "\\\\").replacingOccurrences(of: "\"", with: "\\\"")
|
||||
}
|
||||
|
||||
func pointForElement(_ params: JSON) -> CGPoint? {
|
||||
if let x = params["x"] as? Double, let y = params["y"] as? Double {
|
||||
return CGPoint(x: x, y: y)
|
||||
}
|
||||
guard let stateId = params["stateId"] as? String,
|
||||
let elementIndex = params["element_index"] as? String,
|
||||
let element = stateElements[stateId]?.first(where: { $0.index == elementIndex }),
|
||||
let b = element.bounds,
|
||||
let x = b["x"], let y = b["y"], let width = b["width"], let height = b["height"]
|
||||
else {
|
||||
return nil
|
||||
}
|
||||
return CGPoint(x: x + width / 2, y: y + height / 2)
|
||||
}
|
||||
|
||||
func click(_ params: JSON) throws -> JSON {
|
||||
if let element = cachedElement(params),
|
||||
cgMouseButton(params["mouse_button"]) == .left,
|
||||
(params["click_count"] as? Int ?? 1) == 1,
|
||||
actions(element).contains(kAXPressAction as String),
|
||||
AXUIElementPerformAction(element, kAXPressAction as CFString) == .success {
|
||||
return try getAppState(params)
|
||||
}
|
||||
|
||||
guard let point = pointForElement(params) else {
|
||||
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "click_element requires x/y or stateId + element_index"])
|
||||
}
|
||||
let clickCount = params["click_count"] as? Int ?? 1
|
||||
try postMouseClick(point: point, button: cgMouseButton(params["mouse_button"]), clickCount: clickCount)
|
||||
return try getAppState(params)
|
||||
}
|
||||
|
||||
func performSecondaryAction(_ params: JSON) throws -> JSON {
|
||||
if let element = cachedElement(params),
|
||||
actions(element).contains(kAXShowMenuAction as String),
|
||||
AXUIElementPerformAction(element, kAXShowMenuAction as CFString) == .success {
|
||||
return try getAppState(params)
|
||||
}
|
||||
guard let point = pointForElement(params) else {
|
||||
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "perform_secondary_action requires x/y or stateId + element_index"])
|
||||
}
|
||||
try postMouseClick(point: point, button: .right)
|
||||
return try getAppState(params)
|
||||
}
|
||||
|
||||
func setValue(_ params: JSON) throws -> JSON {
|
||||
guard let value = params["value"] as? String else {
|
||||
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "set_value requires value"])
|
||||
}
|
||||
if let element = cachedElement(params),
|
||||
AXUIElementSetAttributeValue(element, kAXValueAttribute as CFString, value as CFTypeRef) == .success {
|
||||
return try getAppState(params)
|
||||
}
|
||||
guard let point = pointForElement(params) else {
|
||||
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "set_value requires x/y or stateId + element_index"])
|
||||
}
|
||||
try postMouseClick(point: point, button: .left)
|
||||
try runAppleScript("tell application \"System Events\" to keystroke \"a\" using command down")
|
||||
try runAppleScript("tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(value))\"")
|
||||
return try getAppState(params)
|
||||
}
|
||||
|
||||
func typeText(_ params: JSON) throws -> JSON {
|
||||
let text = params["text"] as? String ?? ""
|
||||
try runAppleScript("tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(text))\"")
|
||||
return try getAppState(params)
|
||||
}
|
||||
|
||||
func appleScriptModifiers(_ parts: [String]) -> String {
|
||||
let modifiers = parts.compactMap { part -> String? in
|
||||
switch part.lowercased() {
|
||||
case "cmd", "command", "meta": return "command down"
|
||||
case "ctrl", "control": return "control down"
|
||||
case "alt", "option": return "option down"
|
||||
case "shift": return "shift down"
|
||||
default: return nil
|
||||
}
|
||||
}
|
||||
return modifiers.isEmpty ? "" : " using {\(modifiers.joined(separator: ", "))}"
|
||||
}
|
||||
|
||||
func appleScriptKeyCode(_ key: String) -> Int? {
|
||||
switch key.lowercased() {
|
||||
case "return", "enter": return 36
|
||||
case "tab": return 48
|
||||
case "space": return 49
|
||||
case "delete", "backspace": return 51
|
||||
case "escape", "esc": return 53
|
||||
case "left": return 123
|
||||
case "right": return 124
|
||||
case "down": return 125
|
||||
case "up": return 126
|
||||
default: return nil
|
||||
}
|
||||
}
|
||||
|
||||
func pressKey(_ params: JSON) throws -> JSON {
|
||||
let raw = params["key"] as? String ?? ""
|
||||
let parts = raw.split(separator: "+").map { String($0).trimmingCharacters(in: .whitespacesAndNewlines) }.filter { !$0.isEmpty }
|
||||
let key = parts.last ?? raw
|
||||
let modifiers = appleScriptModifiers(Array(parts.dropLast()))
|
||||
if let keyCode = appleScriptKeyCode(key) {
|
||||
try runAppleScript("tell application \"System Events\" to key code \(keyCode)\(modifiers)")
|
||||
} else {
|
||||
try runAppleScript("tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(key))\"\(modifiers)")
|
||||
}
|
||||
return try getAppState(params)
|
||||
}
|
||||
|
||||
func scrollElement(_ params: JSON) throws -> JSON {
|
||||
guard let point = pointForElement(params) else {
|
||||
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "scroll_element requires x/y or stateId + element_index"])
|
||||
}
|
||||
CGWarpMouseCursorPosition(point)
|
||||
let direction = params["direction"] as? String ?? "down"
|
||||
let pages = params["pages"] as? Double ?? 1.0
|
||||
let amount = Int32(max(1.0, abs(pages) * 8.0))
|
||||
let vertical = direction == "up" ? amount : direction == "down" ? -amount : 0
|
||||
let horizontal = direction == "left" ? amount : direction == "right" ? -amount : 0
|
||||
CGEvent(scrollWheelEvent2Source: nil, units: .line, wheelCount: 2, wheel1: vertical, wheel2: horizontal, wheel3: 0)?.post(tap: .cghidEventTap)
|
||||
return try getAppState(params)
|
||||
}
|
||||
|
||||
func drag(_ params: JSON) throws -> JSON {
|
||||
guard let fromX = params["from_x"] as? Double,
|
||||
let fromY = params["from_y"] as? Double,
|
||||
let toX = params["to_x"] as? Double,
|
||||
let toY = params["to_y"] as? Double
|
||||
else {
|
||||
throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "drag requires from_x/from_y/to_x/to_y"])
|
||||
}
|
||||
try postDrag(from: CGPoint(x: fromX, y: fromY), to: CGPoint(x: toX, y: toY), button: cgMouseButton(params["mouse_button"]))
|
||||
return try getAppState(params)
|
||||
}
|
||||
|
||||
func handle(_ request: JSON) {
|
||||
let id = request["id"]
|
||||
let method = request["method"] as? String ?? ""
|
||||
let params = request["params"] as? JSON ?? [:]
|
||||
|
||||
do {
|
||||
switch method {
|
||||
case "list_apps":
|
||||
respond(id: id, result: listApps())
|
||||
case "get_app_state":
|
||||
respond(id: id, result: try getAppState(params))
|
||||
case "click_element":
|
||||
respond(id: id, result: try click(params))
|
||||
case "perform_secondary_action":
|
||||
respond(id: id, result: try performSecondaryAction(params))
|
||||
case "set_value":
|
||||
respond(id: id, result: try setValue(params))
|
||||
case "type_text":
|
||||
respond(id: id, result: try typeText(params))
|
||||
case "press_key":
|
||||
respond(id: id, result: try pressKey(params))
|
||||
case "scroll_element":
|
||||
respond(id: id, result: try scrollElement(params))
|
||||
case "drag":
|
||||
respond(id: id, result: try drag(params))
|
||||
default:
|
||||
respondError(id: id, "Method is not implemented yet: \(method)")
|
||||
}
|
||||
} catch {
|
||||
respondError(id: id, error.localizedDescription)
|
||||
}
|
||||
}
|
||||
|
||||
while let line = readLine() {
|
||||
guard let data = line.data(using: .utf8),
|
||||
let object = try? JSONSerialization.jsonObject(with: data),
|
||||
let request = object as? JSON
|
||||
else {
|
||||
respondError(id: nil, "Invalid JSON request")
|
||||
continue
|
||||
}
|
||||
handle(request)
|
||||
}
|
||||
@@ -0,0 +1,124 @@
|
||||
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
|
||||
import readline from 'node:readline';
|
||||
|
||||
type JsonRecord = Record<string, unknown>;
|
||||
|
||||
type PendingRequest = {
|
||||
resolve: (value: unknown) => void;
|
||||
reject: (error: Error) => void;
|
||||
timer: ReturnType<typeof setTimeout>;
|
||||
};
|
||||
|
||||
const DEFAULT_TIMEOUT_MS = Number.parseInt(process.env.CLOUDCLI_SEMANTICS_HELPER_TIMEOUT_MS || '60000', 10);
|
||||
|
||||
function timeoutMs(): number {
|
||||
return Number.isFinite(DEFAULT_TIMEOUT_MS) && DEFAULT_TIMEOUT_MS > 0 ? DEFAULT_TIMEOUT_MS : 60000;
|
||||
}
|
||||
|
||||
function errorMessage(error: unknown): string {
|
||||
return error instanceof Error ? error.message : String(error);
|
||||
}
|
||||
|
||||
export class SemanticHelperProcess {
|
||||
private child: ChildProcessWithoutNullStreams | null = null;
|
||||
private reader: readline.Interface | null = null;
|
||||
private nextId = 1;
|
||||
private pending = new Map<number, PendingRequest>();
|
||||
|
||||
constructor(private readonly executablePath: string) {}
|
||||
|
||||
async request(method: string, params: JsonRecord): Promise<unknown> {
|
||||
this.ensureStarted();
|
||||
const child = this.child;
|
||||
if (!child?.stdin.writable) {
|
||||
throw new Error('Semantic helper process is not running.');
|
||||
}
|
||||
|
||||
const id = this.nextId++;
|
||||
return new Promise((resolve, reject) => {
|
||||
const timer = setTimeout(() => {
|
||||
this.pending.delete(id);
|
||||
reject(new Error(`Semantic helper request timed out: ${method}`));
|
||||
}, timeoutMs());
|
||||
this.pending.set(id, { resolve, reject, timer });
|
||||
child.stdin.write(`${JSON.stringify({ id, method, params })}\n`);
|
||||
});
|
||||
}
|
||||
|
||||
stop(): void {
|
||||
const child = this.child;
|
||||
this.child = null;
|
||||
this.reader?.close();
|
||||
this.reader = null;
|
||||
this.rejectAll('Semantic helper stopped.');
|
||||
if (child) {
|
||||
try { child.kill('SIGTERM'); } catch { /* noop */ }
|
||||
}
|
||||
}
|
||||
|
||||
private ensureStarted(): void {
|
||||
if (this.child) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.child = spawn(this.executablePath, [], {
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
windowsHide: true,
|
||||
});
|
||||
|
||||
this.reader = readline.createInterface({ input: this.child.stdout });
|
||||
this.reader.on('line', (line) => this.handleLine(line));
|
||||
|
||||
this.child.stderr.on('data', (chunk) => {
|
||||
const text = String(chunk).trim();
|
||||
if (text) {
|
||||
console.error('[SemanticHelper]', text);
|
||||
}
|
||||
});
|
||||
|
||||
this.child.once('error', (error) => {
|
||||
this.child = null;
|
||||
this.rejectAll(`Failed to start semantic helper: ${error.message}`);
|
||||
});
|
||||
|
||||
this.child.once('exit', (code) => {
|
||||
this.child = null;
|
||||
this.rejectAll(`Semantic helper exited with code ${code ?? 'null'}.`);
|
||||
});
|
||||
}
|
||||
|
||||
private handleLine(line: string): void {
|
||||
let message: JsonRecord;
|
||||
try {
|
||||
message = JSON.parse(line) as JsonRecord;
|
||||
} catch (error) {
|
||||
console.error('[SemanticHelper] Invalid JSON response:', errorMessage(error));
|
||||
return;
|
||||
}
|
||||
|
||||
const id = typeof message.id === 'number' ? message.id : null;
|
||||
if (id === null) {
|
||||
return;
|
||||
}
|
||||
const pending = this.pending.get(id);
|
||||
if (!pending) {
|
||||
return;
|
||||
}
|
||||
clearTimeout(pending.timer);
|
||||
this.pending.delete(id);
|
||||
|
||||
if (message.error) {
|
||||
pending.reject(new Error(typeof message.error === 'string' ? message.error : 'Semantic helper request failed.'));
|
||||
return;
|
||||
}
|
||||
pending.resolve(message.result);
|
||||
}
|
||||
|
||||
private rejectAll(reason: string): void {
|
||||
for (const [id, request] of this.pending.entries()) {
|
||||
clearTimeout(request.timer);
|
||||
request.reject(new Error(reason));
|
||||
this.pending.delete(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
export type SemanticHelperPlatform = 'darwin' | 'win32';
|
||||
|
||||
export type SemanticHelperResolution = {
|
||||
available: boolean;
|
||||
path: string | null;
|
||||
source: 'bundled' | 'dev' | 'missing';
|
||||
platform: NodeJS.Platform;
|
||||
arch: NodeJS.Architecture;
|
||||
reason?: string;
|
||||
};
|
||||
|
||||
function helperExecutableName(platform: NodeJS.Platform): string | null {
|
||||
if (platform === 'darwin') {
|
||||
return 'CloudCLISemantics';
|
||||
}
|
||||
if (platform === 'win32') {
|
||||
return 'CloudCLISemantics.exe';
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function pathExists(filePath: string): boolean {
|
||||
try {
|
||||
fs.accessSync(filePath, fs.constants.X_OK);
|
||||
return true;
|
||||
} catch {
|
||||
try {
|
||||
fs.accessSync(filePath, fs.constants.F_OK);
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function candidatePaths(platform: NodeJS.Platform, arch: NodeJS.Architecture): Array<{ source: 'bundled' | 'dev'; path: string }> {
|
||||
const executable = helperExecutableName(platform);
|
||||
if (!executable) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const platformArch = `${platform}-${arch}`;
|
||||
return [
|
||||
{
|
||||
source: 'bundled',
|
||||
path: path.resolve(__dirname, '..', 'bin', platformArch, executable),
|
||||
},
|
||||
{
|
||||
source: 'dev',
|
||||
path: path.resolve(process.cwd(), 'server', 'modules', 'computer-use', 'semantics', 'bin', platformArch, executable),
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
export function resolveSemanticHelper(
|
||||
platform: NodeJS.Platform = process.platform,
|
||||
arch: NodeJS.Architecture = process.arch,
|
||||
): SemanticHelperResolution {
|
||||
const executable = helperExecutableName(platform);
|
||||
if (!executable) {
|
||||
return {
|
||||
available: false,
|
||||
path: null,
|
||||
source: 'missing',
|
||||
platform,
|
||||
arch,
|
||||
reason: `Semantic Computer Use helper is not supported on ${platform}.`,
|
||||
};
|
||||
}
|
||||
|
||||
for (const candidate of candidatePaths(platform, arch)) {
|
||||
if (pathExists(candidate.path)) {
|
||||
return {
|
||||
available: true,
|
||||
path: candidate.path,
|
||||
source: candidate.source,
|
||||
platform,
|
||||
arch,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
available: false,
|
||||
path: null,
|
||||
source: 'missing',
|
||||
platform,
|
||||
arch,
|
||||
reason: `Bundled semantic helper was not found for ${platform}-${arch} (${executable}).`,
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net8.0-windows</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<UseWindowsForms>true</UseWindowsForms>
|
||||
<UseWPF>true</UseWPF>
|
||||
<AssemblyName>CloudCLISemantics</AssemblyName>
|
||||
</PropertyGroup>
|
||||
</Project>
|
||||
534
server/modules/computer-use/semantics/helpers/windows/Program.cs
Normal file
534
server/modules/computer-use/semantics/helpers/windows/Program.cs
Normal file
@@ -0,0 +1,534 @@
|
||||
using System.Diagnostics;
|
||||
using System.Drawing;
|
||||
using System.Drawing.Imaging;
|
||||
using System.IO;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Text.Json;
|
||||
using System.Windows.Automation;
|
||||
|
||||
static class Program
|
||||
{
|
||||
private const int MaxStoredStates = 100;
|
||||
private static readonly Dictionary<string, List<ElementRecord>> StateElements = new();
|
||||
private static readonly Dictionary<string, Dictionary<string, AutomationElement>> StateAutomationElements = new();
|
||||
private static readonly Queue<string> StateOrder = new();
|
||||
|
||||
public static void Main()
|
||||
{
|
||||
string? line;
|
||||
while ((line = Console.ReadLine()) != null)
|
||||
{
|
||||
try
|
||||
{
|
||||
using var doc = JsonDocument.Parse(line);
|
||||
var root = doc.RootElement;
|
||||
var id = root.TryGetProperty("id", out var idValue) ? idValue.Clone() : default;
|
||||
var method = root.TryGetProperty("method", out var methodValue) ? methodValue.GetString() ?? "" : "";
|
||||
var parameters = root.TryGetProperty("params", out var paramsValue) && paramsValue.ValueKind == JsonValueKind.Object
|
||||
? paramsValue.Clone()
|
||||
: JsonDocument.Parse("{}").RootElement.Clone();
|
||||
|
||||
try
|
||||
{
|
||||
object result = method switch
|
||||
{
|
||||
"list_apps" => ListApps(),
|
||||
"get_app_state" => GetAppState(parameters),
|
||||
"click_element" => ClickElement(parameters),
|
||||
"perform_secondary_action" => PerformSecondaryAction(parameters),
|
||||
"set_value" => SetValue(parameters),
|
||||
"type_text" => TypeText(parameters),
|
||||
"press_key" => PressKey(parameters),
|
||||
"scroll_element" => ScrollElement(parameters),
|
||||
"drag" => Drag(parameters),
|
||||
_ => throw new InvalidOperationException($"Method is not implemented yet: {method}")
|
||||
};
|
||||
Write(new Dictionary<string, object?> { ["id"] = JsonValue(id), ["result"] = result });
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Write(new Dictionary<string, object?> { ["id"] = JsonValue(id), ["error"] = ex.Message });
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Write(new Dictionary<string, object?> { ["id"] = null, ["error"] = $"Invalid JSON request: {ex.Message}" });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static object? JsonValue(JsonElement element)
|
||||
{
|
||||
return element.ValueKind switch
|
||||
{
|
||||
JsonValueKind.String => element.GetString(),
|
||||
JsonValueKind.Number => element.TryGetInt64(out var number) ? number : element.GetDouble(),
|
||||
JsonValueKind.True => true,
|
||||
JsonValueKind.False => false,
|
||||
_ => null
|
||||
};
|
||||
}
|
||||
|
||||
private static void Write(object value)
|
||||
{
|
||||
Console.WriteLine(JsonSerializer.Serialize(value));
|
||||
Console.Out.Flush();
|
||||
}
|
||||
|
||||
private static List<Dictionary<string, object?>> ListApps()
|
||||
{
|
||||
return Process.GetProcesses()
|
||||
.Where(process => process.MainWindowHandle != IntPtr.Zero)
|
||||
.OrderBy(process => process.ProcessName)
|
||||
.Select(process => new Dictionary<string, object?>
|
||||
{
|
||||
["id"] = process.Id.ToString(),
|
||||
["name"] = process.ProcessName,
|
||||
["processName"] = process.ProcessName,
|
||||
["pid"] = process.Id,
|
||||
["running"] = true,
|
||||
["windowTitle"] = process.MainWindowTitle
|
||||
})
|
||||
.ToList();
|
||||
}
|
||||
|
||||
private static Process ResolveProcess(string query)
|
||||
{
|
||||
var normalized = query.Trim();
|
||||
if (string.IsNullOrWhiteSpace(normalized))
|
||||
{
|
||||
throw new InvalidOperationException("app is required.");
|
||||
}
|
||||
|
||||
var processes = Process.GetProcesses()
|
||||
.Where(process => process.MainWindowHandle != IntPtr.Zero)
|
||||
.ToList();
|
||||
|
||||
return processes.FirstOrDefault(process => process.ProcessName.Equals(normalized, StringComparison.OrdinalIgnoreCase))
|
||||
?? processes.FirstOrDefault(process => process.MainWindowTitle.Equals(normalized, StringComparison.OrdinalIgnoreCase))
|
||||
?? processes.FirstOrDefault(process => process.MainWindowTitle.Contains(normalized, StringComparison.OrdinalIgnoreCase))
|
||||
?? throw new InvalidOperationException($"App is not running: {query}");
|
||||
}
|
||||
|
||||
private static Dictionary<string, object?> GetAppState(JsonElement parameters)
|
||||
{
|
||||
var appQuery = ReadString(parameters, "app");
|
||||
var process = ResolveProcess(appQuery);
|
||||
var root = AutomationElement.FromHandle(process.MainWindowHandle)
|
||||
?? throw new InvalidOperationException("No UI Automation root window is available.");
|
||||
|
||||
var records = new List<ElementRecord>();
|
||||
var automationElements = new Dictionary<string, AutomationElement>();
|
||||
Walk(root, records, automationElements, 0, 5, 300);
|
||||
var stateId = $"state_{Guid.NewGuid()}";
|
||||
StateElements[stateId] = records;
|
||||
StateAutomationElements[stateId] = automationElements;
|
||||
StateOrder.Enqueue(stateId);
|
||||
PruneStoredStates();
|
||||
|
||||
var elements = records.Select(record => record.ToDictionary()).ToList();
|
||||
var bounds = root.Current.BoundingRectangle;
|
||||
return new Dictionary<string, object?>
|
||||
{
|
||||
["stateId"] = stateId,
|
||||
["app"] = process.ProcessName,
|
||||
["platform"] = "win32",
|
||||
["screenshotDataUrl"] = CaptureScreen(),
|
||||
["displaySize"] = new Dictionary<string, object?>
|
||||
{
|
||||
["width"] = (int)System.Windows.Forms.Screen.PrimaryScreen!.Bounds.Width,
|
||||
["height"] = (int)System.Windows.Forms.Screen.PrimaryScreen!.Bounds.Height
|
||||
},
|
||||
["window"] = new Dictionary<string, object?>
|
||||
{
|
||||
["title"] = process.MainWindowTitle,
|
||||
["bounds"] = BoundsDictionary(bounds)
|
||||
},
|
||||
["elements"] = elements,
|
||||
["accessibilityTree"] = elements,
|
||||
["treeText"] = string.Join("\n", elements.Select(element => $"{element["index"]} {element["role"]} {element.GetValueOrDefault("title")}"))
|
||||
};
|
||||
}
|
||||
|
||||
private static Dictionary<string, object?> ClickElement(JsonElement parameters)
|
||||
{
|
||||
var mouseButton = ReadString(parameters, "mouse_button");
|
||||
if ((mouseButton == "" || mouseButton == "left") && ReadInt(parameters, "click_count", 1) == 1)
|
||||
{
|
||||
var element = AutomationElementFor(parameters);
|
||||
if (element != null && TryInvoke(element))
|
||||
{
|
||||
return GetAppState(parameters);
|
||||
}
|
||||
}
|
||||
|
||||
var point = PointFor(parameters);
|
||||
if (point == null)
|
||||
{
|
||||
throw new InvalidOperationException("click_element requires x/y or stateId + element_index.");
|
||||
}
|
||||
|
||||
SendMouseClick(point.Value.X, point.Value.Y, ReadString(parameters, "mouse_button"), ReadInt(parameters, "click_count", 1));
|
||||
return GetAppState(parameters);
|
||||
}
|
||||
|
||||
private static Dictionary<string, object?> PerformSecondaryAction(JsonElement parameters)
|
||||
{
|
||||
var point = PointFor(parameters);
|
||||
if (point == null)
|
||||
{
|
||||
throw new InvalidOperationException("perform_secondary_action requires x/y or stateId + element_index.");
|
||||
}
|
||||
|
||||
SendMouseClick(point.Value.X, point.Value.Y, "right", 1);
|
||||
return GetAppState(parameters);
|
||||
}
|
||||
|
||||
private static Dictionary<string, object?> SetValue(JsonElement parameters)
|
||||
{
|
||||
var value = ReadString(parameters, "value");
|
||||
var element = AutomationElementFor(parameters);
|
||||
var focused = false;
|
||||
if (element != null)
|
||||
{
|
||||
if (element.TryGetCurrentPattern(ValuePattern.Pattern, out var valuePattern))
|
||||
{
|
||||
((ValuePattern)valuePattern).SetValue(value);
|
||||
return GetAppState(parameters);
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
element.SetFocus();
|
||||
focused = true;
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Fall through to coordinate focus below.
|
||||
}
|
||||
}
|
||||
|
||||
var point = PointFor(parameters);
|
||||
if (point != null)
|
||||
{
|
||||
SendMouseClick(point.Value.X, point.Value.Y, "left", 1);
|
||||
focused = true;
|
||||
}
|
||||
else if (!focused && element == null)
|
||||
{
|
||||
throw new InvalidOperationException("set_value requires x/y or stateId + element_index.");
|
||||
}
|
||||
else if (!focused)
|
||||
{
|
||||
throw new InvalidOperationException("set_value could not focus the requested element.");
|
||||
}
|
||||
System.Windows.Forms.SendKeys.SendWait("^a");
|
||||
System.Windows.Forms.SendKeys.SendWait(EscapeSendKeys(value));
|
||||
return GetAppState(parameters);
|
||||
}
|
||||
|
||||
private static Dictionary<string, object?> TypeText(JsonElement parameters)
|
||||
{
|
||||
var text = ReadString(parameters, "text");
|
||||
System.Windows.Forms.SendKeys.SendWait(EscapeSendKeys(text));
|
||||
return GetAppState(parameters);
|
||||
}
|
||||
|
||||
private static Dictionary<string, object?> PressKey(JsonElement parameters)
|
||||
{
|
||||
var key = ReadString(parameters, "key");
|
||||
System.Windows.Forms.SendKeys.SendWait(ToSendKeysChord(key));
|
||||
return GetAppState(parameters);
|
||||
}
|
||||
|
||||
private static Dictionary<string, object?> ScrollElement(JsonElement parameters)
|
||||
{
|
||||
var element = AutomationElementFor(parameters);
|
||||
var direction = ReadString(parameters, "direction");
|
||||
var pages = ReadDouble(parameters, "pages", 1);
|
||||
if (element != null && element.TryGetCurrentPattern(ScrollPattern.Pattern, out var scrollPatternValue))
|
||||
{
|
||||
var scrollPattern = (ScrollPattern)scrollPatternValue;
|
||||
var vertical = direction == "up" ? ScrollAmount.LargeDecrement : direction == "down" ? ScrollAmount.LargeIncrement : ScrollAmount.NoAmount;
|
||||
var horizontal = direction == "left" ? ScrollAmount.LargeDecrement : direction == "right" ? ScrollAmount.LargeIncrement : ScrollAmount.NoAmount;
|
||||
scrollPattern.Scroll(horizontal, vertical);
|
||||
return GetAppState(parameters);
|
||||
}
|
||||
|
||||
var point = PointFor(parameters);
|
||||
if (point == null)
|
||||
{
|
||||
throw new InvalidOperationException("scroll_element requires x/y or stateId + element_index.");
|
||||
}
|
||||
SetCursorPos(point.Value.X, point.Value.Y);
|
||||
var wheel = (int)Math.Round(Math.Max(1, pages) * 120);
|
||||
if (direction == "down") wheel = -wheel;
|
||||
mouse_event(0x0800, 0, 0, unchecked((uint)wheel), UIntPtr.Zero);
|
||||
return GetAppState(parameters);
|
||||
}
|
||||
|
||||
private static void PruneStoredStates()
|
||||
{
|
||||
while (StateOrder.Count > MaxStoredStates)
|
||||
{
|
||||
var evicted = StateOrder.Dequeue();
|
||||
StateElements.Remove(evicted);
|
||||
StateAutomationElements.Remove(evicted);
|
||||
}
|
||||
}
|
||||
|
||||
private static Dictionary<string, object?> Drag(JsonElement parameters)
|
||||
{
|
||||
var fromX = ReadDouble(parameters, "from_x", double.NaN);
|
||||
var fromY = ReadDouble(parameters, "from_y", double.NaN);
|
||||
var toX = ReadDouble(parameters, "to_x", double.NaN);
|
||||
var toY = ReadDouble(parameters, "to_y", double.NaN);
|
||||
if (double.IsNaN(fromX) || double.IsNaN(fromY) || double.IsNaN(toX) || double.IsNaN(toY))
|
||||
{
|
||||
throw new InvalidOperationException("drag requires from_x/from_y/to_x/to_y.");
|
||||
}
|
||||
|
||||
SetCursorPos((int)Math.Round(fromX), (int)Math.Round(fromY));
|
||||
mouse_event(0x0002, 0, 0, 0, UIntPtr.Zero);
|
||||
Thread.Sleep(80);
|
||||
SetCursorPos((int)Math.Round(toX), (int)Math.Round(toY));
|
||||
Thread.Sleep(80);
|
||||
mouse_event(0x0004, 0, 0, 0, UIntPtr.Zero);
|
||||
return GetAppState(parameters);
|
||||
}
|
||||
|
||||
private static void Walk(AutomationElement element, List<ElementRecord> records, Dictionary<string, AutomationElement> automationElements, int depth, int maxDepth, int limit)
|
||||
{
|
||||
if (depth > maxDepth || records.Count >= limit) return;
|
||||
var index = (records.Count + 1).ToString();
|
||||
records.Add(ElementRecord.From(element, index));
|
||||
automationElements[index] = element;
|
||||
var children = element.FindAll(TreeScope.Children, Condition.TrueCondition);
|
||||
foreach (AutomationElement child in children)
|
||||
{
|
||||
Walk(child, records, automationElements, depth + 1, maxDepth, limit);
|
||||
if (records.Count >= limit) return;
|
||||
}
|
||||
}
|
||||
|
||||
private static string ReadString(JsonElement element, string property)
|
||||
{
|
||||
return element.TryGetProperty(property, out var value) && value.ValueKind == JsonValueKind.String
|
||||
? value.GetString() ?? ""
|
||||
: "";
|
||||
}
|
||||
|
||||
private static int ReadInt(JsonElement element, string property, int defaultValue)
|
||||
{
|
||||
return element.TryGetProperty(property, out var value) && value.TryGetInt32(out var number)
|
||||
? number
|
||||
: defaultValue;
|
||||
}
|
||||
|
||||
private static double ReadDouble(JsonElement element, string property, double defaultValue)
|
||||
{
|
||||
return element.TryGetProperty(property, out var value) && value.TryGetDouble(out var number)
|
||||
? number
|
||||
: defaultValue;
|
||||
}
|
||||
|
||||
private static AutomationElement? AutomationElementFor(JsonElement parameters)
|
||||
{
|
||||
var stateId = ReadString(parameters, "stateId");
|
||||
var elementIndex = ReadString(parameters, "element_index");
|
||||
return !string.IsNullOrWhiteSpace(stateId)
|
||||
&& !string.IsNullOrWhiteSpace(elementIndex)
|
||||
&& StateAutomationElements.TryGetValue(stateId, out var elements)
|
||||
&& elements.TryGetValue(elementIndex, out var element)
|
||||
? element
|
||||
: null;
|
||||
}
|
||||
|
||||
private static System.Drawing.Point? PointFor(JsonElement parameters)
|
||||
{
|
||||
if (parameters.TryGetProperty("x", out var xValue) && parameters.TryGetProperty("y", out var yValue)
|
||||
&& xValue.TryGetDouble(out var x) && yValue.TryGetDouble(out var y))
|
||||
{
|
||||
return new System.Drawing.Point((int)Math.Round(x), (int)Math.Round(y));
|
||||
}
|
||||
|
||||
var stateId = ReadString(parameters, "stateId");
|
||||
var elementIndex = ReadString(parameters, "element_index");
|
||||
if (string.IsNullOrWhiteSpace(stateId) || string.IsNullOrWhiteSpace(elementIndex)) return null;
|
||||
if (!StateElements.TryGetValue(stateId, out var elements)) return null;
|
||||
var element = elements.FirstOrDefault(item => item.Index == elementIndex);
|
||||
if (element?.Bounds == null) return null;
|
||||
return new System.Drawing.Point(
|
||||
(int)Math.Round(element.Bounds.Value.Left + element.Bounds.Value.Width / 2),
|
||||
(int)Math.Round(element.Bounds.Value.Top + element.Bounds.Value.Height / 2)
|
||||
);
|
||||
}
|
||||
|
||||
private static string CaptureScreen()
|
||||
{
|
||||
var bounds = System.Windows.Forms.Screen.PrimaryScreen!.Bounds;
|
||||
using var bitmap = new Bitmap(bounds.Width, bounds.Height);
|
||||
using var graphics = Graphics.FromImage(bitmap);
|
||||
graphics.CopyFromScreen(bounds.Left, bounds.Top, 0, 0, bounds.Size);
|
||||
using var stream = new MemoryStream();
|
||||
bitmap.Save(stream, ImageFormat.Png);
|
||||
return $"data:image/png;base64,{Convert.ToBase64String(stream.ToArray())}";
|
||||
}
|
||||
|
||||
private static Dictionary<string, object?> BoundsDictionary(System.Windows.Rect rect)
|
||||
{
|
||||
return new Dictionary<string, object?>
|
||||
{
|
||||
["x"] = rect.X,
|
||||
["y"] = rect.Y,
|
||||
["width"] = rect.Width,
|
||||
["height"] = rect.Height
|
||||
};
|
||||
}
|
||||
|
||||
[DllImport("user32.dll")]
|
||||
private static extern bool SetCursorPos(int x, int y);
|
||||
|
||||
[DllImport("user32.dll")]
|
||||
private static extern void mouse_event(uint dwFlags, uint dx, uint dy, uint dwData, UIntPtr dwExtraInfo);
|
||||
|
||||
private static void SendMouseClick(int x, int y, string button, int clickCount)
|
||||
{
|
||||
var (down, up) = button switch
|
||||
{
|
||||
"right" => (0x0008u, 0x0010u),
|
||||
"middle" => (0x0020u, 0x0040u),
|
||||
_ => (0x0002u, 0x0004u)
|
||||
};
|
||||
SetCursorPos(x, y);
|
||||
for (var i = 0; i < Math.Max(1, clickCount); i++)
|
||||
{
|
||||
mouse_event(down, 0, 0, 0, UIntPtr.Zero);
|
||||
mouse_event(up, 0, 0, 0, UIntPtr.Zero);
|
||||
Thread.Sleep(80);
|
||||
}
|
||||
}
|
||||
|
||||
private static bool TryInvoke(AutomationElement element)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (!element.TryGetCurrentPattern(InvokePattern.Pattern, out var pattern)) return false;
|
||||
((InvokePattern)pattern).Invoke();
|
||||
return true;
|
||||
}
|
||||
catch
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private static string EscapeSendKeys(string value)
|
||||
{
|
||||
return value
|
||||
.Replace("{", "{{}")
|
||||
.Replace("}", "{}}")
|
||||
.Replace("+", "{+}")
|
||||
.Replace("^", "{^}")
|
||||
.Replace("%", "{%}")
|
||||
.Replace("~", "{~}")
|
||||
.Replace("(", "{(}")
|
||||
.Replace(")", "{)}")
|
||||
.Replace("[", "{[}")
|
||||
.Replace("]", "{]}");
|
||||
}
|
||||
|
||||
private static string ToSendKeysChord(string key)
|
||||
{
|
||||
var normalized = key.Trim();
|
||||
if (normalized.Contains('+'))
|
||||
{
|
||||
var parts = normalized.Split('+', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
|
||||
var modifiers = "";
|
||||
var last = parts.LastOrDefault() ?? "";
|
||||
foreach (var part in parts.Take(parts.Length - 1))
|
||||
{
|
||||
modifiers += part.ToLowerInvariant() switch
|
||||
{
|
||||
"ctrl" or "control" => "^",
|
||||
"alt" => "%",
|
||||
"shift" => "+",
|
||||
"cmd" or "win" or "windows" => "^",
|
||||
_ => ""
|
||||
};
|
||||
}
|
||||
return modifiers + SendKeyName(last);
|
||||
}
|
||||
return SendKeyName(normalized);
|
||||
}
|
||||
|
||||
private static string SendKeyName(string key)
|
||||
{
|
||||
return key.ToLowerInvariant() switch
|
||||
{
|
||||
"return" or "enter" => "{ENTER}",
|
||||
"escape" or "esc" => "{ESC}",
|
||||
"tab" => "{TAB}",
|
||||
"backspace" => "{BACKSPACE}",
|
||||
"delete" or "del" => "{DELETE}",
|
||||
"left" => "{LEFT}",
|
||||
"right" => "{RIGHT}",
|
||||
"up" => "{UP}",
|
||||
"down" => "{DOWN}",
|
||||
"space" => " ",
|
||||
_ => key.Length == 1 ? EscapeSendKeys(key) : $"{{{key.ToUpperInvariant()}}}"
|
||||
};
|
||||
}
|
||||
|
||||
private sealed record ElementRecord(
|
||||
string Index,
|
||||
string Role,
|
||||
string? Title,
|
||||
string? Value,
|
||||
System.Windows.Rect? Bounds,
|
||||
List<string> Actions)
|
||||
{
|
||||
public static ElementRecord From(AutomationElement element, string index)
|
||||
{
|
||||
var patterns = element.GetSupportedPatterns().Select(pattern => pattern.ProgrammaticName).ToList();
|
||||
return new ElementRecord(
|
||||
index,
|
||||
element.Current.ControlType.ProgrammaticName.Replace("ControlType.", ""),
|
||||
element.Current.Name,
|
||||
TryValue(element),
|
||||
element.Current.BoundingRectangle,
|
||||
patterns
|
||||
);
|
||||
}
|
||||
|
||||
public Dictionary<string, object?> ToDictionary()
|
||||
{
|
||||
var output = new Dictionary<string, object?>
|
||||
{
|
||||
["index"] = Index,
|
||||
["role"] = Role,
|
||||
["actions"] = Actions
|
||||
};
|
||||
if (!string.IsNullOrEmpty(Title)) output["title"] = Title;
|
||||
if (!string.IsNullOrEmpty(Value)) output["value"] = Value;
|
||||
if (Bounds != null) output["bounds"] = BoundsDictionary(Bounds.Value);
|
||||
return output;
|
||||
}
|
||||
|
||||
private static string? TryValue(AutomationElement element)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (element.TryGetCurrentPattern(ValuePattern.Pattern, out var pattern))
|
||||
{
|
||||
return ((ValuePattern)pattern).Current.Value;
|
||||
}
|
||||
}
|
||||
catch
|
||||
{
|
||||
return null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,87 @@
|
||||
import { randomUUID } from 'node:crypto';
|
||||
|
||||
import type { SemanticAppState, SemanticElement } from '@/modules/computer-use/semantics/semantic-types.js';
|
||||
|
||||
const DEFAULT_STATE_TTL_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_SEMANTIC_STATE_TTL_MS || String(10 * 60 * 1000), 10);
|
||||
|
||||
type StoredState = {
|
||||
sessionId: string;
|
||||
appKey: string;
|
||||
state: SemanticAppState;
|
||||
updatedAt: number;
|
||||
};
|
||||
|
||||
function normalizeAppKey(app: string): string {
|
||||
return app.trim().toLowerCase();
|
||||
}
|
||||
|
||||
export class SemanticSessionStore {
|
||||
private states = new Map<string, StoredState>();
|
||||
private latestBySessionApp = new Map<string, string>();
|
||||
|
||||
createStateId(): string {
|
||||
return `state_${randomUUID()}`;
|
||||
}
|
||||
|
||||
save(sessionId: string, state: SemanticAppState): SemanticAppState {
|
||||
const appKey = normalizeAppKey(state.app);
|
||||
const nextState = {
|
||||
...state,
|
||||
stateId: state.stateId || this.createStateId(),
|
||||
};
|
||||
this.states.set(nextState.stateId, {
|
||||
sessionId,
|
||||
appKey,
|
||||
state: nextState,
|
||||
updatedAt: Date.now(),
|
||||
});
|
||||
this.latestBySessionApp.set(this.latestKey(sessionId, appKey), nextState.stateId);
|
||||
return nextState;
|
||||
}
|
||||
|
||||
getState(sessionId: string, app: string, stateId?: string): SemanticAppState | null {
|
||||
this.expire();
|
||||
if (stateId) {
|
||||
const entry = this.states.get(stateId);
|
||||
const appKey = normalizeAppKey(app);
|
||||
return entry && entry.sessionId === sessionId && entry.appKey === appKey ? entry.state : null;
|
||||
}
|
||||
const latestStateId = this.latestBySessionApp.get(this.latestKey(sessionId, normalizeAppKey(app)));
|
||||
return latestStateId ? this.states.get(latestStateId)?.state || null : null;
|
||||
}
|
||||
|
||||
getElement(sessionId: string, app: string, elementIndex: string, stateId?: string): SemanticElement | null {
|
||||
const state = this.getState(sessionId, app, stateId);
|
||||
return state?.elements.find((element) => element.index === elementIndex) || null;
|
||||
}
|
||||
|
||||
clearSession(sessionId: string): void {
|
||||
for (const [stateId, entry] of this.states.entries()) {
|
||||
if (entry.sessionId === sessionId) {
|
||||
this.states.delete(stateId);
|
||||
this.latestBySessionApp.delete(this.latestKey(entry.sessionId, entry.appKey));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
expire(now = Date.now()): void {
|
||||
const ttl = Number.isFinite(DEFAULT_STATE_TTL_MS) && DEFAULT_STATE_TTL_MS > 0
|
||||
? DEFAULT_STATE_TTL_MS
|
||||
: 10 * 60 * 1000;
|
||||
for (const [stateId, entry] of this.states.entries()) {
|
||||
if (now - entry.updatedAt > ttl) {
|
||||
this.states.delete(stateId);
|
||||
const key = this.latestKey(entry.sessionId, entry.appKey);
|
||||
if (this.latestBySessionApp.get(key) === stateId) {
|
||||
this.latestBySessionApp.delete(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private latestKey(sessionId: string, appKey: string): string {
|
||||
return `${sessionId}:${appKey}`;
|
||||
}
|
||||
}
|
||||
|
||||
export const semanticSessionStore = new SemanticSessionStore();
|
||||
@@ -0,0 +1,17 @@
|
||||
export const semanticMcpToolMap: Record<string, string> = {
|
||||
computer_app_drag: 'drag',
|
||||
computer_click_element: 'click_element',
|
||||
computer_get_app_state: 'get_app_state',
|
||||
computer_list_apps: 'list_apps',
|
||||
computer_perform_secondary_action: 'perform_secondary_action',
|
||||
computer_press_key: 'press_key',
|
||||
computer_scroll_element: 'scroll_element',
|
||||
computer_set_value: 'set_value',
|
||||
computer_type_text: 'type_text',
|
||||
};
|
||||
|
||||
export const semanticOperationNames = new Set(Object.values(semanticMcpToolMap));
|
||||
|
||||
export function semanticOperationForMcpTool(toolName: string): string | null {
|
||||
return semanticMcpToolMap[toolName] || null;
|
||||
}
|
||||
58
server/modules/computer-use/semantics/semantic-types.ts
Normal file
58
server/modules/computer-use/semantics/semantic-types.ts
Normal file
@@ -0,0 +1,58 @@
|
||||
import type { DisplaySize, Point } from '@/modules/computer-use/computer-executor.js';
|
||||
|
||||
export type SemanticBounds = {
|
||||
x: number;
|
||||
y: number;
|
||||
width: number;
|
||||
height: number;
|
||||
};
|
||||
|
||||
export type SemanticApp = {
|
||||
id?: string;
|
||||
name: string;
|
||||
bundleIdentifier?: string;
|
||||
processName?: string;
|
||||
pid?: number;
|
||||
running: boolean;
|
||||
windowTitle?: string;
|
||||
};
|
||||
|
||||
export type SemanticElement = {
|
||||
index: string;
|
||||
role: string;
|
||||
title?: string;
|
||||
value?: string;
|
||||
description?: string;
|
||||
enabled?: boolean;
|
||||
focused?: boolean;
|
||||
selected?: boolean;
|
||||
bounds?: SemanticBounds;
|
||||
actions?: string[];
|
||||
settableValue?: boolean;
|
||||
};
|
||||
|
||||
export type SemanticAppState = {
|
||||
stateId: string;
|
||||
app: string;
|
||||
platform: NodeJS.Platform;
|
||||
screenshotDataUrl: string | null;
|
||||
displaySize: DisplaySize | null;
|
||||
elements: SemanticElement[];
|
||||
accessibilityTree: SemanticElement[];
|
||||
treeText?: string;
|
||||
message?: string;
|
||||
};
|
||||
|
||||
export type SemanticToolInput = Record<string, unknown> & {
|
||||
sessionId?: string;
|
||||
app?: string;
|
||||
stateId?: string;
|
||||
element_index?: string;
|
||||
};
|
||||
|
||||
export type SemanticToolResult = SemanticAppState | {
|
||||
apps: SemanticApp[];
|
||||
platform: NodeJS.Platform;
|
||||
};
|
||||
|
||||
export type SemanticActionPoint = Point;
|
||||
@@ -4,6 +4,7 @@ export { apiKeysDb } from '@/modules/database/repositories/api-keys.js';
|
||||
export { appConfigDb } from '@/modules/database/repositories/app-config.js';
|
||||
export { credentialsDb } from '@/modules/database/repositories/credentials.js';
|
||||
export { githubTokensDb } from '@/modules/database/repositories/github-tokens.js';
|
||||
export { notificationChannelEndpointsDb } from '@/modules/database/repositories/notification-channel-endpoints.js';
|
||||
export { notificationPreferencesDb } from '@/modules/database/repositories/notification-preferences.js';
|
||||
export { projectsDb } from '@/modules/database/repositories/projects.db.js';
|
||||
export { pushSubscriptionsDb } from '@/modules/database/repositories/push-subscriptions.js';
|
||||
|
||||
@@ -3,6 +3,7 @@ import { Database } from 'better-sqlite3';
|
||||
import {
|
||||
APP_CONFIG_TABLE_SCHEMA_SQL,
|
||||
LAST_SCANNED_AT_SQL,
|
||||
NOTIFICATION_CHANNEL_ENDPOINTS_TABLE_SCHEMA_SQL,
|
||||
PROJECTS_TABLE_SCHEMA_SQL,
|
||||
PUSH_SUBSCRIPTIONS_TABLE_SCHEMA_SQL,
|
||||
SESSIONS_TABLE_SCHEMA_SQL,
|
||||
@@ -440,6 +441,9 @@ export const runMigrations = (db: Database) => {
|
||||
db.exec(VAPID_KEYS_TABLE_SCHEMA_SQL);
|
||||
db.exec(PUSH_SUBSCRIPTIONS_TABLE_SCHEMA_SQL);
|
||||
db.exec('CREATE INDEX IF NOT EXISTS idx_push_subscriptions_user_id ON push_subscriptions(user_id)');
|
||||
db.exec(NOTIFICATION_CHANNEL_ENDPOINTS_TABLE_SCHEMA_SQL);
|
||||
db.exec('CREATE INDEX IF NOT EXISTS idx_notification_channel_endpoints_user_channel ON notification_channel_endpoints(user_id, channel)');
|
||||
db.exec('CREATE INDEX IF NOT EXISTS idx_notification_channel_endpoints_enabled ON notification_channel_endpoints(enabled)');
|
||||
|
||||
db.exec(PROJECTS_TABLE_SCHEMA_SQL);
|
||||
rebuildProjectsTableWithPrimaryKeySchema(db);
|
||||
|
||||
@@ -0,0 +1,153 @@
|
||||
import { getConnection } from '@/modules/database/connection.js';
|
||||
|
||||
type NotificationChannelEndpointRow = {
|
||||
id: number;
|
||||
user_id: number;
|
||||
channel: string;
|
||||
endpoint_id: string;
|
||||
label: string | null;
|
||||
metadata_json: string | null;
|
||||
enabled: number;
|
||||
last_seen_at: string;
|
||||
created_at: string;
|
||||
updated_at: string;
|
||||
};
|
||||
|
||||
type UpsertNotificationChannelEndpointInput = {
|
||||
userId: number;
|
||||
channel: string;
|
||||
endpointId: string;
|
||||
label?: string | null;
|
||||
metadata?: Record<string, unknown> | null;
|
||||
enabled?: boolean;
|
||||
};
|
||||
|
||||
function normalizeRequiredText(value: unknown): string {
|
||||
if (typeof value !== 'string') return '';
|
||||
return value.trim();
|
||||
}
|
||||
|
||||
function normalizeNullableText(value: unknown): string | null {
|
||||
if (typeof value !== 'string') return null;
|
||||
const normalized = value.trim();
|
||||
return normalized || null;
|
||||
}
|
||||
|
||||
function serializeMetadata(metadata: Record<string, unknown> | null | undefined): string | null {
|
||||
if (!metadata || typeof metadata !== 'object') return null;
|
||||
return JSON.stringify(metadata);
|
||||
}
|
||||
|
||||
function parseMetadata(metadataJson: string | null): Record<string, unknown> {
|
||||
if (!metadataJson) return {};
|
||||
try {
|
||||
const parsed = JSON.parse(metadataJson);
|
||||
return parsed && typeof parsed === 'object' ? parsed as Record<string, unknown> : {};
|
||||
} catch {
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
export const notificationChannelEndpointsDb = {
|
||||
upsertEndpoint(input: UpsertNotificationChannelEndpointInput): NotificationChannelEndpointRow {
|
||||
const channel = normalizeRequiredText(input.channel);
|
||||
const endpointId = normalizeRequiredText(input.endpointId);
|
||||
if (!channel) throw new Error('channel is required');
|
||||
if (!endpointId) throw new Error('endpointId is required');
|
||||
|
||||
const enabled = input.enabled === false ? 0 : 1;
|
||||
const db = getConnection();
|
||||
db.prepare(
|
||||
`INSERT INTO notification_channel_endpoints (
|
||||
user_id,
|
||||
channel,
|
||||
endpoint_id,
|
||||
label,
|
||||
metadata_json,
|
||||
enabled,
|
||||
last_seen_at,
|
||||
updated_at
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
|
||||
ON CONFLICT(user_id, channel, endpoint_id) DO UPDATE SET
|
||||
label = excluded.label,
|
||||
metadata_json = excluded.metadata_json,
|
||||
enabled = excluded.enabled,
|
||||
last_seen_at = CURRENT_TIMESTAMP,
|
||||
updated_at = CURRENT_TIMESTAMP`
|
||||
).run(
|
||||
input.userId,
|
||||
channel,
|
||||
endpointId,
|
||||
normalizeNullableText(input.label),
|
||||
serializeMetadata(input.metadata),
|
||||
enabled
|
||||
);
|
||||
|
||||
return notificationChannelEndpointsDb.getEndpoint(input.userId, channel, endpointId)!;
|
||||
},
|
||||
|
||||
getEndpoint(userId: number, channel: string, endpointId: string): NotificationChannelEndpointRow | null {
|
||||
const db = getConnection();
|
||||
const row = db.prepare(
|
||||
`SELECT id, user_id, channel, endpoint_id, label, metadata_json, enabled, last_seen_at, created_at, updated_at
|
||||
FROM notification_channel_endpoints
|
||||
WHERE user_id = ? AND channel = ? AND endpoint_id = ?`
|
||||
).get(
|
||||
userId,
|
||||
normalizeRequiredText(channel),
|
||||
normalizeRequiredText(endpointId)
|
||||
) as NotificationChannelEndpointRow | undefined;
|
||||
return row || null;
|
||||
},
|
||||
|
||||
getEndpoints(userId: number, channel: string): NotificationChannelEndpointRow[] {
|
||||
const db = getConnection();
|
||||
return db.prepare(
|
||||
`SELECT id, user_id, channel, endpoint_id, label, metadata_json, enabled, last_seen_at, created_at, updated_at
|
||||
FROM notification_channel_endpoints
|
||||
WHERE user_id = ? AND channel = ?
|
||||
ORDER BY last_seen_at DESC`
|
||||
).all(userId, normalizeRequiredText(channel)) as NotificationChannelEndpointRow[];
|
||||
},
|
||||
|
||||
getEnabledEndpoints(userId: number, channel: string): NotificationChannelEndpointRow[] {
|
||||
const db = getConnection();
|
||||
return db.prepare(
|
||||
`SELECT id, user_id, channel, endpoint_id, label, metadata_json, enabled, last_seen_at, created_at, updated_at
|
||||
FROM notification_channel_endpoints
|
||||
WHERE user_id = ? AND channel = ? AND enabled = 1
|
||||
ORDER BY last_seen_at DESC`
|
||||
).all(userId, normalizeRequiredText(channel)) as NotificationChannelEndpointRow[];
|
||||
},
|
||||
|
||||
setEndpointEnabled(userId: number, channel: string, endpointId: string, enabled: boolean): boolean {
|
||||
const db = getConnection();
|
||||
const result = db.prepare(
|
||||
`UPDATE notification_channel_endpoints
|
||||
SET enabled = ?, updated_at = CURRENT_TIMESTAMP
|
||||
WHERE user_id = ? AND channel = ? AND endpoint_id = ?`
|
||||
).run(enabled ? 1 : 0, userId, normalizeRequiredText(channel), normalizeRequiredText(endpointId));
|
||||
return result.changes > 0;
|
||||
},
|
||||
|
||||
touchEndpoint(userId: number, channel: string, endpointId: string): boolean {
|
||||
const db = getConnection();
|
||||
const result = db.prepare(
|
||||
`UPDATE notification_channel_endpoints
|
||||
SET last_seen_at = CURRENT_TIMESTAMP
|
||||
WHERE user_id = ? AND channel = ? AND endpoint_id = ?`
|
||||
).run(userId, normalizeRequiredText(channel), normalizeRequiredText(endpointId));
|
||||
return result.changes > 0;
|
||||
},
|
||||
|
||||
removeEndpoint(userId: number, channel: string, endpointId: string): boolean {
|
||||
const db = getConnection();
|
||||
const result = db.prepare(
|
||||
'DELETE FROM notification_channel_endpoints WHERE user_id = ? AND channel = ? AND endpoint_id = ?'
|
||||
).run(userId, normalizeRequiredText(channel), normalizeRequiredText(endpointId));
|
||||
return result.changes > 0;
|
||||
},
|
||||
|
||||
parseMetadata,
|
||||
};
|
||||
@@ -10,7 +10,9 @@ type NotificationPreferences = {
|
||||
channels: {
|
||||
inApp: boolean;
|
||||
webPush: boolean;
|
||||
desktop: boolean;
|
||||
sound: boolean;
|
||||
[key: string]: boolean;
|
||||
};
|
||||
events: {
|
||||
actionRequired: boolean;
|
||||
@@ -23,6 +25,7 @@ const DEFAULT_NOTIFICATION_PREFERENCES: NotificationPreferences = {
|
||||
channels: {
|
||||
inApp: false,
|
||||
webPush: false,
|
||||
desktop: false,
|
||||
sound: true,
|
||||
},
|
||||
events: {
|
||||
@@ -34,11 +37,20 @@ const DEFAULT_NOTIFICATION_PREFERENCES: NotificationPreferences = {
|
||||
|
||||
function normalizeNotificationPreferences(value: unknown): NotificationPreferences {
|
||||
const source = value && typeof value === 'object' ? (value as Record<string, any>) : {};
|
||||
const sourceChannels = source.channels && typeof source.channels === 'object'
|
||||
? source.channels as Record<string, unknown>
|
||||
: {};
|
||||
const extraChannels = Object.fromEntries(
|
||||
Object.entries(sourceChannels)
|
||||
.filter(([key, channelValue]) => !['inApp', 'webPush', 'desktop', 'sound'].includes(key) && typeof channelValue === 'boolean')
|
||||
) as Record<string, boolean>;
|
||||
|
||||
return {
|
||||
channels: {
|
||||
...extraChannels,
|
||||
inApp: source.channels?.inApp === true,
|
||||
webPush: source.channels?.webPush === true,
|
||||
desktop: source.channels?.desktop === true,
|
||||
sound: source.channels?.sound !== false,
|
||||
},
|
||||
events: {
|
||||
@@ -103,4 +115,3 @@ export const notificationPreferencesDb = {
|
||||
return notificationPreferencesDb.updateNotificationPreferences(userId, preferences);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@@ -69,6 +69,23 @@ CREATE TABLE IF NOT EXISTS push_subscriptions (
|
||||
);
|
||||
`;
|
||||
|
||||
export const NOTIFICATION_CHANNEL_ENDPOINTS_TABLE_SCHEMA_SQL = `
|
||||
CREATE TABLE IF NOT EXISTS notification_channel_endpoints (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
user_id INTEGER NOT NULL,
|
||||
channel TEXT NOT NULL,
|
||||
endpoint_id TEXT NOT NULL,
|
||||
label TEXT,
|
||||
metadata_json TEXT,
|
||||
enabled BOOLEAN DEFAULT 1,
|
||||
last_seen_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
UNIQUE(user_id, channel, endpoint_id),
|
||||
FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE
|
||||
);
|
||||
`;
|
||||
|
||||
export const PROJECTS_TABLE_SCHEMA_SQL = `
|
||||
CREATE TABLE IF NOT EXISTS projects (
|
||||
project_id TEXT PRIMARY KEY NOT NULL,
|
||||
@@ -144,6 +161,10 @@ ${VAPID_KEYS_TABLE_SCHEMA_SQL}
|
||||
${PUSH_SUBSCRIPTIONS_TABLE_SCHEMA_SQL}
|
||||
CREATE INDEX IF NOT EXISTS idx_push_subscriptions_user_id ON push_subscriptions(user_id);
|
||||
|
||||
${NOTIFICATION_CHANNEL_ENDPOINTS_TABLE_SCHEMA_SQL}
|
||||
CREATE INDEX IF NOT EXISTS idx_notification_channel_endpoints_user_channel ON notification_channel_endpoints(user_id, channel);
|
||||
CREATE INDEX IF NOT EXISTS idx_notification_channel_endpoints_enabled ON notification_channel_endpoints(enabled);
|
||||
|
||||
${PROJECTS_TABLE_SCHEMA_SQL}
|
||||
-- NOTE: These indexes are created in migrations after legacy table-shape repairs.
|
||||
-- Creating them here can fail on upgraded installs where projects lacks those columns.
|
||||
|
||||
13
server/modules/notifications/index.ts
Normal file
13
server/modules/notifications/index.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
export {
|
||||
buildNotificationPayload,
|
||||
createNotificationEvent,
|
||||
notifyUserIfEnabled,
|
||||
notifyRunFailed,
|
||||
notifyRunStopped,
|
||||
} from '@/modules/notifications/services/notification-orchestrator.service.js';
|
||||
export {
|
||||
registerDesktopNotificationClient,
|
||||
sendDesktopNotification,
|
||||
unregisterDesktopNotificationClient,
|
||||
} from '@/modules/notifications/services/desktop-notification-clients.service.js';
|
||||
export { handleDesktopNotificationsConnection } from '@/modules/notifications/websocket/desktop-notifications-websocket.service.js';
|
||||
127
server/modules/notifications/notifications.routes.ts
Normal file
127
server/modules/notifications/notifications.routes.ts
Normal file
@@ -0,0 +1,127 @@
|
||||
import express from 'express';
|
||||
|
||||
import { notificationChannelEndpointsDb, notificationPreferencesDb } from '@/modules/database/index.js';
|
||||
|
||||
const router = express.Router();
|
||||
|
||||
function readText(value: unknown): string {
|
||||
return typeof value === 'string' ? value.trim() : '';
|
||||
}
|
||||
|
||||
function sanitizeEndpoint(endpoint: any) {
|
||||
return {
|
||||
id: endpoint.id,
|
||||
channel: endpoint.channel,
|
||||
endpointId: endpoint.endpoint_id,
|
||||
label: endpoint.label,
|
||||
metadata: notificationChannelEndpointsDb.parseMetadata(endpoint.metadata_json),
|
||||
enabled: Boolean(endpoint.enabled),
|
||||
lastSeenAt: endpoint.last_seen_at,
|
||||
createdAt: endpoint.created_at,
|
||||
updatedAt: endpoint.updated_at,
|
||||
};
|
||||
}
|
||||
|
||||
function readUserId(req: express.Request): number {
|
||||
const userId = Number((req as any).user?.id);
|
||||
if (!Number.isInteger(userId) || userId <= 0) {
|
||||
throw new Error('Authenticated user is missing');
|
||||
}
|
||||
return userId;
|
||||
}
|
||||
|
||||
function updateChannelPreference(userId: number, channel: string): unknown {
|
||||
const currentPrefs = notificationPreferencesDb.getPreferences(userId);
|
||||
const hasEnabledEndpoint = notificationChannelEndpointsDb.getEnabledEndpoints(userId, channel).length > 0;
|
||||
return notificationPreferencesDb.updatePreferences(userId, {
|
||||
...currentPrefs,
|
||||
channels: { ...currentPrefs.channels, [channel]: hasEnabledEndpoint },
|
||||
});
|
||||
}
|
||||
|
||||
router.get('/endpoints', (req, res) => {
|
||||
try {
|
||||
const channel = readText(req.query.channel);
|
||||
if (!channel) {
|
||||
return res.status(400).json({ error: 'channel is required' });
|
||||
}
|
||||
|
||||
const userId = readUserId(req);
|
||||
const endpoints = notificationChannelEndpointsDb
|
||||
.getEndpoints(userId, channel)
|
||||
.map(sanitizeEndpoint);
|
||||
return res.json({ success: true, endpoints });
|
||||
} catch (error) {
|
||||
console.error('Error fetching notification endpoints:', error);
|
||||
return res.status(500).json({ error: 'Failed to fetch notification endpoints' });
|
||||
}
|
||||
});
|
||||
|
||||
router.post('/endpoints/current', (req, res) => {
|
||||
try {
|
||||
const { channel, endpointId, label, metadata = {}, enabled = true } = req.body || {};
|
||||
const normalizedChannel = readText(channel);
|
||||
const normalizedEndpointId = readText(endpointId);
|
||||
if (!normalizedChannel || !normalizedEndpointId) {
|
||||
return res.status(400).json({ error: 'channel and endpointId are required' });
|
||||
}
|
||||
|
||||
const userId = readUserId(req);
|
||||
const endpoint = notificationChannelEndpointsDb.upsertEndpoint({
|
||||
userId,
|
||||
channel: normalizedChannel,
|
||||
endpointId: normalizedEndpointId,
|
||||
label,
|
||||
metadata: metadata && typeof metadata === 'object' ? metadata : {},
|
||||
enabled: enabled !== false,
|
||||
});
|
||||
|
||||
const preferences = updateChannelPreference(userId, normalizedChannel);
|
||||
return res.json({ success: true, endpoint: sanitizeEndpoint(endpoint), preferences });
|
||||
} catch (error) {
|
||||
console.error('Error registering notification endpoint:', error);
|
||||
return res.status(500).json({ error: 'Failed to register notification endpoint' });
|
||||
}
|
||||
});
|
||||
|
||||
router.patch('/endpoints/:channel/:endpointId', (req, res) => {
|
||||
try {
|
||||
const { channel, endpointId } = req.params;
|
||||
const { enabled } = req.body || {};
|
||||
if (typeof enabled !== 'boolean') {
|
||||
return res.status(400).json({ error: 'enabled must be a boolean' });
|
||||
}
|
||||
|
||||
const userId = readUserId(req);
|
||||
const updated = notificationChannelEndpointsDb.setEndpointEnabled(userId, channel, endpointId, enabled);
|
||||
if (!updated) {
|
||||
return res.status(404).json({ error: 'Notification endpoint not found' });
|
||||
}
|
||||
|
||||
const endpoint = notificationChannelEndpointsDb.getEndpoint(userId, channel, endpointId);
|
||||
const preferences = updateChannelPreference(userId, channel);
|
||||
return res.json({ success: true, endpoint: endpoint ? sanitizeEndpoint(endpoint) : null, preferences });
|
||||
} catch (error) {
|
||||
console.error('Error updating notification endpoint:', error);
|
||||
return res.status(500).json({ error: 'Failed to update notification endpoint' });
|
||||
}
|
||||
});
|
||||
|
||||
router.delete('/endpoints/:channel/:endpointId', (req, res) => {
|
||||
try {
|
||||
const { channel, endpointId } = req.params;
|
||||
const userId = readUserId(req);
|
||||
const removed = notificationChannelEndpointsDb.removeEndpoint(userId, channel, endpointId);
|
||||
if (!removed) {
|
||||
return res.status(404).json({ error: 'Notification endpoint not found' });
|
||||
}
|
||||
|
||||
const preferences = updateChannelPreference(userId, channel);
|
||||
return res.json({ success: true, preferences });
|
||||
} catch (error) {
|
||||
console.error('Error removing notification endpoint:', error);
|
||||
return res.status(500).json({ error: 'Failed to remove notification endpoint' });
|
||||
}
|
||||
});
|
||||
|
||||
export default router;
|
||||
@@ -0,0 +1,124 @@
|
||||
import type { WebSocket } from 'ws';
|
||||
|
||||
import { notificationChannelEndpointsDb } from '@/modules/database/index.js';
|
||||
|
||||
const DESKTOP_CHANNEL = 'desktop';
|
||||
|
||||
const clientsByUserId = new Map<number, Map<string, WebSocket>>();
|
||||
const clientBySocket = new WeakMap<WebSocket, { userId: number; endpointId: string }>();
|
||||
|
||||
function normalizeUserId(userId: unknown): number | null {
|
||||
const numeric = Number(userId);
|
||||
return Number.isInteger(numeric) && numeric > 0 ? numeric : null;
|
||||
}
|
||||
|
||||
function normalizeEndpointId(endpointId: unknown): string {
|
||||
if (typeof endpointId !== 'string') return '';
|
||||
return endpointId.trim();
|
||||
}
|
||||
|
||||
function getUserClients(userId: unknown, create = false): Map<string, WebSocket> | null {
|
||||
const normalizedUserId = normalizeUserId(userId);
|
||||
if (!normalizedUserId) return null;
|
||||
let clients = clientsByUserId.get(normalizedUserId);
|
||||
if (!clients && create) {
|
||||
clients = new Map();
|
||||
clientsByUserId.set(normalizedUserId, clients);
|
||||
}
|
||||
return clients || null;
|
||||
}
|
||||
|
||||
export function registerDesktopNotificationClient({
|
||||
userId,
|
||||
deviceId,
|
||||
label = null,
|
||||
platform = null,
|
||||
appVersion = null,
|
||||
ws,
|
||||
}: {
|
||||
userId: number;
|
||||
deviceId: string;
|
||||
label?: string | null;
|
||||
platform?: string | null;
|
||||
appVersion?: string | null;
|
||||
ws: WebSocket;
|
||||
}) {
|
||||
const normalizedUserId = normalizeUserId(userId);
|
||||
const endpointId = normalizeEndpointId(deviceId);
|
||||
if (!normalizedUserId || !endpointId) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const endpoint = notificationChannelEndpointsDb.upsertEndpoint({
|
||||
userId: normalizedUserId,
|
||||
channel: DESKTOP_CHANNEL,
|
||||
endpointId,
|
||||
label,
|
||||
metadata: { platform, appVersion },
|
||||
enabled: true,
|
||||
});
|
||||
|
||||
const clients = getUserClients(normalizedUserId, true)!;
|
||||
const previous = clients.get(endpointId);
|
||||
if (previous && previous !== ws && previous.readyState === previous.OPEN) {
|
||||
previous.close(4000, 'Device reconnected');
|
||||
}
|
||||
|
||||
clients.set(endpointId, ws);
|
||||
clientBySocket.set(ws, { userId: normalizedUserId, endpointId });
|
||||
return endpoint;
|
||||
}
|
||||
|
||||
export function unregisterDesktopNotificationClient(ws: WebSocket): void {
|
||||
const registration = clientBySocket.get(ws);
|
||||
if (!registration) return;
|
||||
|
||||
const clients = getUserClients(registration.userId);
|
||||
if (clients?.get(registration.endpointId) === ws) {
|
||||
clients.delete(registration.endpointId);
|
||||
if (clients.size === 0) {
|
||||
clientsByUserId.delete(registration.userId);
|
||||
}
|
||||
}
|
||||
clientBySocket.delete(ws);
|
||||
}
|
||||
|
||||
export function sendDesktopNotification(userId: unknown, payload: unknown): { attempted: number; sent: number } {
|
||||
const normalizedUserId = normalizeUserId(userId);
|
||||
if (!normalizedUserId) return { attempted: 0, sent: 0 };
|
||||
|
||||
const clients = getUserClients(normalizedUserId);
|
||||
if (!clients?.size) return { attempted: 0, sent: 0 };
|
||||
|
||||
const enabledEndpointIds = new Set(
|
||||
notificationChannelEndpointsDb
|
||||
.getEnabledEndpoints(normalizedUserId, DESKTOP_CHANNEL)
|
||||
.map((endpoint) => endpoint.endpoint_id)
|
||||
);
|
||||
|
||||
const message = JSON.stringify({
|
||||
type: 'notification',
|
||||
id: typeof (payload as any)?.data?.tag === 'string' ? (payload as any).data.tag : `${Date.now()}`,
|
||||
payload,
|
||||
});
|
||||
|
||||
let attempted = 0;
|
||||
let sent = 0;
|
||||
for (const [endpointId, ws] of clients.entries()) {
|
||||
if (!enabledEndpointIds.has(endpointId)) continue;
|
||||
attempted += 1;
|
||||
if (ws.readyState !== ws.OPEN) {
|
||||
unregisterDesktopNotificationClient(ws);
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
ws.send(message);
|
||||
notificationChannelEndpointsDb.touchEndpoint(normalizedUserId, DESKTOP_CHANNEL, endpointId);
|
||||
sent += 1;
|
||||
} catch {
|
||||
unregisterDesktopNotificationClient(ws);
|
||||
}
|
||||
}
|
||||
|
||||
return { attempted, sent };
|
||||
}
|
||||
@@ -0,0 +1,288 @@
|
||||
import webPush from 'web-push';
|
||||
|
||||
import { notificationPreferencesDb, pushSubscriptionsDb, sessionsDb } from '@/modules/database/index.js';
|
||||
import { sendDesktopNotification as sendDesktopNotificationToClients } from '@/modules/notifications/services/desktop-notification-clients.service.js';
|
||||
|
||||
const KIND_TO_PREF_KEY = {
|
||||
action_required: 'actionRequired',
|
||||
stop: 'stop',
|
||||
error: 'error'
|
||||
};
|
||||
|
||||
const PROVIDER_LABELS = {
|
||||
claude: 'Claude',
|
||||
cursor: 'Cursor',
|
||||
codex: 'Codex',
|
||||
gemini: 'Gemini',
|
||||
system: 'System'
|
||||
};
|
||||
|
||||
const recentEventKeys = new Map();
|
||||
const DEDUPE_WINDOW_MS = 20000;
|
||||
|
||||
const cleanupOldEventKeys = () => {
|
||||
const now = Date.now();
|
||||
for (const [key, timestamp] of recentEventKeys.entries()) {
|
||||
if (now - timestamp > DEDUPE_WINDOW_MS) {
|
||||
recentEventKeys.delete(key);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
function isNotificationEventEnabled(preferences, event) {
|
||||
const prefEventKey = KIND_TO_PREF_KEY[event.kind];
|
||||
const eventEnabled = prefEventKey ? Boolean(preferences?.events?.[prefEventKey]) : true;
|
||||
|
||||
return eventEnabled;
|
||||
}
|
||||
|
||||
function isDuplicate(event) {
|
||||
cleanupOldEventKeys();
|
||||
const key = event.dedupeKey || `${event.provider}:${event.kind || 'info'}:${event.code || 'generic'}:${event.sessionId || 'none'}`;
|
||||
if (recentEventKeys.has(key)) {
|
||||
return true;
|
||||
}
|
||||
recentEventKeys.set(key, Date.now());
|
||||
return false;
|
||||
}
|
||||
|
||||
function createNotificationEvent({
|
||||
provider,
|
||||
sessionId = null,
|
||||
kind = 'info',
|
||||
code = 'generic.info',
|
||||
meta = {},
|
||||
severity = 'info',
|
||||
dedupeKey = null,
|
||||
requiresUserAction = false
|
||||
}) {
|
||||
return {
|
||||
provider,
|
||||
sessionId,
|
||||
kind,
|
||||
code,
|
||||
meta,
|
||||
severity,
|
||||
requiresUserAction,
|
||||
dedupeKey,
|
||||
createdAt: new Date().toISOString()
|
||||
};
|
||||
}
|
||||
|
||||
function normalizeErrorMessage(error) {
|
||||
if (typeof error === 'string') {
|
||||
return error;
|
||||
}
|
||||
|
||||
if (error && typeof error.message === 'string') {
|
||||
return error.message;
|
||||
}
|
||||
|
||||
if (error == null) {
|
||||
return 'Unknown error';
|
||||
}
|
||||
|
||||
return String(error);
|
||||
}
|
||||
|
||||
function normalizeSessionName(sessionName) {
|
||||
if (typeof sessionName !== 'string') {
|
||||
return null;
|
||||
}
|
||||
|
||||
const normalized = sessionName.replace(/\s+/g, ' ').trim();
|
||||
if (!normalized) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return normalized.length > 80 ? `${normalized.slice(0, 77)}...` : normalized;
|
||||
}
|
||||
|
||||
function rowMatchesProvider(row, provider) {
|
||||
return row && (!provider || row.provider === provider);
|
||||
}
|
||||
|
||||
function resolveSessionRow(sessionId, provider) {
|
||||
if (!sessionId) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const appSessionRow = sessionsDb.getSessionById(sessionId);
|
||||
if (rowMatchesProvider(appSessionRow, provider)) {
|
||||
return appSessionRow;
|
||||
}
|
||||
|
||||
const providerSessionRow = sessionsDb.getSessionByProviderSessionId(sessionId);
|
||||
if (rowMatchesProvider(providerSessionRow, provider)) {
|
||||
return providerSessionRow;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
function normalizeNotificationSession(event) {
|
||||
if (!event?.sessionId || !event.provider || event.provider === 'system') {
|
||||
return event;
|
||||
}
|
||||
|
||||
const row = resolveSessionRow(event.sessionId, event.provider);
|
||||
if (!row || row.session_id === event.sessionId) {
|
||||
return event;
|
||||
}
|
||||
|
||||
return {
|
||||
...event,
|
||||
sessionId: row.session_id
|
||||
};
|
||||
}
|
||||
|
||||
function resolveSessionName(event) {
|
||||
const explicitSessionName = normalizeSessionName(event.meta?.sessionName);
|
||||
if (explicitSessionName) {
|
||||
return explicitSessionName;
|
||||
}
|
||||
|
||||
if (!event.sessionId || !event.provider) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return normalizeSessionName(sessionsDb.getSessionName(event.sessionId, event.provider));
|
||||
}
|
||||
|
||||
function buildNotificationPayload(event) {
|
||||
const normalizedEvent = normalizeNotificationSession(event);
|
||||
const CODE_MAP = {
|
||||
'permission.required': normalizedEvent.meta?.toolName
|
||||
? `Action Required: Tool "${normalizedEvent.meta.toolName}" needs approval`
|
||||
: 'Action Required: A tool needs your approval',
|
||||
'run.stopped': normalizedEvent.meta?.stopReason || 'Run Stopped: The run has stopped',
|
||||
'run.failed': normalizedEvent.meta?.error ? `Run Failed: ${normalizedEvent.meta.error}` : 'Run Failed: The run encountered an error',
|
||||
'agent.notification': normalizedEvent.meta?.message ? String(normalizedEvent.meta.message) : 'You have a new notification',
|
||||
'push.enabled': 'Push notifications are now enabled!'
|
||||
};
|
||||
const providerLabel = PROVIDER_LABELS[normalizedEvent.provider] || 'Assistant';
|
||||
const sessionName = resolveSessionName(normalizedEvent);
|
||||
const message = CODE_MAP[normalizedEvent.code] || 'You have a new notification';
|
||||
|
||||
return {
|
||||
title: sessionName || 'CloudCLI',
|
||||
body: `${providerLabel}: ${message}`,
|
||||
data: {
|
||||
sessionId: normalizedEvent.sessionId || null,
|
||||
code: normalizedEvent.code,
|
||||
provider: normalizedEvent.provider || null,
|
||||
sessionName,
|
||||
tag: `${normalizedEvent.provider || 'assistant'}:${normalizedEvent.sessionId || 'none'}:${normalizedEvent.code}`
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
function sendWebPushPayload(userId, payload) {
|
||||
const subscriptions = pushSubscriptionsDb.getSubscriptions(userId);
|
||||
if (!subscriptions.length) return Promise.resolve();
|
||||
|
||||
const serializedPayload = JSON.stringify(payload);
|
||||
return Promise.allSettled(
|
||||
subscriptions.map((sub) =>
|
||||
webPush.sendNotification(
|
||||
{
|
||||
endpoint: sub.endpoint,
|
||||
keys: {
|
||||
p256dh: sub.keys_p256dh,
|
||||
auth: sub.keys_auth
|
||||
}
|
||||
},
|
||||
serializedPayload
|
||||
)
|
||||
)
|
||||
).then((results) => {
|
||||
results.forEach((result, index) => {
|
||||
if (result.status === 'rejected') {
|
||||
const statusCode = result.reason?.statusCode;
|
||||
if (statusCode === 410 || statusCode === 404) {
|
||||
pushSubscriptionsDb.removeSubscription(subscriptions[index].endpoint);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
const notificationChannels = [
|
||||
{
|
||||
id: 'webPush',
|
||||
// TODO: Web push still uses push_subscriptions. Do not remove that table until
|
||||
// browser push subscriptions are migrated into notification_channel_endpoints.
|
||||
isEnabled: (preferences) => Boolean(preferences?.channels?.webPush),
|
||||
send: ({ userId, payload }) => sendWebPushPayload(userId, payload)
|
||||
},
|
||||
{
|
||||
id: 'desktop',
|
||||
isEnabled: (preferences) => Boolean(preferences?.channels?.desktop),
|
||||
send: ({ userId, payload }) => sendDesktopNotificationToClients(userId, payload)
|
||||
}
|
||||
];
|
||||
|
||||
function notifyUserIfEnabled({ userId, event }) {
|
||||
if (!userId || !event) {
|
||||
return;
|
||||
}
|
||||
|
||||
const normalizedEvent = normalizeNotificationSession(event);
|
||||
const preferences = notificationPreferencesDb.getPreferences(userId);
|
||||
if (!isNotificationEventEnabled(preferences, normalizedEvent)) {
|
||||
return;
|
||||
}
|
||||
if (isDuplicate(normalizedEvent)) {
|
||||
return;
|
||||
}
|
||||
|
||||
const payload = buildNotificationPayload(normalizedEvent);
|
||||
for (const channel of notificationChannels) {
|
||||
if (!channel.isEnabled(preferences)) {
|
||||
continue;
|
||||
}
|
||||
Promise.resolve(channel.send({ userId, event: normalizedEvent, payload })).catch((err) => {
|
||||
console.error(`Notification channel "${channel.id}" send error:`, err);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
function notifyRunStopped({ userId, provider, sessionId = null, stopReason = 'completed', sessionName = null }) {
|
||||
notifyUserIfEnabled({
|
||||
userId,
|
||||
event: createNotificationEvent({
|
||||
provider,
|
||||
sessionId,
|
||||
kind: 'stop',
|
||||
code: 'run.stopped',
|
||||
meta: { stopReason, sessionName },
|
||||
severity: 'info',
|
||||
dedupeKey: `${provider}:run:stop:${sessionId || 'none'}:${stopReason}`
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
function notifyRunFailed({ userId, provider, sessionId = null, error, sessionName = null }) {
|
||||
const errorMessage = normalizeErrorMessage(error);
|
||||
|
||||
notifyUserIfEnabled({
|
||||
userId,
|
||||
event: createNotificationEvent({
|
||||
provider,
|
||||
sessionId,
|
||||
kind: 'error',
|
||||
code: 'run.failed',
|
||||
meta: { error: errorMessage, sessionName },
|
||||
severity: 'error',
|
||||
dedupeKey: `${provider}:run:error:${sessionId || 'none'}:${errorMessage}`
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
export {
|
||||
buildNotificationPayload,
|
||||
createNotificationEvent,
|
||||
notifyUserIfEnabled,
|
||||
notifyRunStopped,
|
||||
notifyRunFailed
|
||||
};
|
||||
@@ -0,0 +1,109 @@
|
||||
import type { WebSocket } from 'ws';
|
||||
|
||||
import {
|
||||
registerDesktopNotificationClient,
|
||||
unregisterDesktopNotificationClient,
|
||||
} from '@/modules/notifications/services/desktop-notification-clients.service.js';
|
||||
import type { AuthenticatedWebSocketRequest } from '@/shared/types.js';
|
||||
import { parseIncomingJsonObject } from '@/shared/utils.js';
|
||||
|
||||
type DesktopNotificationRegisterMessage = {
|
||||
type?: unknown;
|
||||
kind?: unknown;
|
||||
deviceId?: unknown;
|
||||
label?: unknown;
|
||||
platform?: unknown;
|
||||
appVersion?: unknown;
|
||||
};
|
||||
|
||||
function readRequestUserId(request: AuthenticatedWebSocketRequest): number | null {
|
||||
const user = request.user;
|
||||
const rawUserId = typeof user?.id === 'number' || typeof user?.id === 'string'
|
||||
? user.id
|
||||
: typeof user?.userId === 'number' || typeof user?.userId === 'string'
|
||||
? user.userId
|
||||
: null;
|
||||
const numericUserId = Number(rawUserId);
|
||||
return Number.isInteger(numericUserId) && numericUserId > 0 ? numericUserId : null;
|
||||
}
|
||||
|
||||
function readOptionalString(value: unknown): string | null {
|
||||
if (typeof value !== 'string') return null;
|
||||
const normalized = value.trim();
|
||||
return normalized || null;
|
||||
}
|
||||
|
||||
function sendJson(ws: WebSocket, payload: unknown): void {
|
||||
if (ws.readyState === ws.OPEN) {
|
||||
ws.send(JSON.stringify(payload));
|
||||
}
|
||||
}
|
||||
|
||||
export function handleDesktopNotificationsConnection(
|
||||
ws: WebSocket,
|
||||
request: AuthenticatedWebSocketRequest
|
||||
): void {
|
||||
const userId = readRequestUserId(request);
|
||||
if (!userId) {
|
||||
ws.close(1008, 'Missing authenticated user');
|
||||
return;
|
||||
}
|
||||
|
||||
let registered = false;
|
||||
|
||||
ws.on('message', (rawMessage) => {
|
||||
const data = parseIncomingJsonObject(rawMessage) as DesktopNotificationRegisterMessage | null;
|
||||
if (!data) {
|
||||
return;
|
||||
}
|
||||
|
||||
const type = typeof data.type === 'string' ? data.type : typeof data.kind === 'string' ? data.kind : '';
|
||||
if (type === 'notification_ack') {
|
||||
return;
|
||||
}
|
||||
|
||||
if (type !== 'register' || registered) {
|
||||
return;
|
||||
}
|
||||
|
||||
const deviceId = readOptionalString(data.deviceId);
|
||||
if (!deviceId) {
|
||||
sendJson(ws, {
|
||||
type: 'error',
|
||||
code: 'DEVICE_ID_REQUIRED',
|
||||
message: 'Desktop notification registration requires deviceId.',
|
||||
});
|
||||
ws.close(1008, 'Missing deviceId');
|
||||
return;
|
||||
}
|
||||
|
||||
const device = registerDesktopNotificationClient({
|
||||
userId,
|
||||
deviceId,
|
||||
label: readOptionalString(data.label),
|
||||
platform: readOptionalString(data.platform),
|
||||
appVersion: readOptionalString(data.appVersion),
|
||||
ws,
|
||||
});
|
||||
|
||||
if (!device) {
|
||||
ws.close(1011, 'Registration failed');
|
||||
return;
|
||||
}
|
||||
|
||||
registered = true;
|
||||
sendJson(ws, {
|
||||
type: 'registered',
|
||||
deviceId: device.endpoint_id,
|
||||
enabled: Boolean(device.enabled),
|
||||
});
|
||||
});
|
||||
|
||||
ws.on('close', () => {
|
||||
unregisterDesktopNotificationClient(ws);
|
||||
});
|
||||
|
||||
ws.on('error', () => {
|
||||
unregisterDesktopNotificationClient(ws);
|
||||
});
|
||||
}
|
||||
@@ -430,6 +430,17 @@ router.post(
|
||||
}),
|
||||
);
|
||||
|
||||
router.delete(
|
||||
'/:provider/skills/:directoryName',
|
||||
asyncHandler(async (req: Request, res: Response) => {
|
||||
const provider = parseProvider(req.params.provider);
|
||||
const result = await providerSkillsService.removeProviderSkill(provider, {
|
||||
directoryName: readPathParam(req.params.directoryName, 'directoryName'),
|
||||
});
|
||||
res.json(createApiSuccessResponse(result));
|
||||
}),
|
||||
);
|
||||
|
||||
// ----------------- MCP routes -----------------
|
||||
router.get(
|
||||
'/:provider/mcp/servers',
|
||||
|
||||
@@ -3,6 +3,7 @@ import type {
|
||||
ProviderSkill,
|
||||
ProviderSkillCreateInput,
|
||||
ProviderSkillListOptions,
|
||||
ProviderSkillRemoveInput,
|
||||
} from '@/shared/types.js';
|
||||
|
||||
export const providerSkillsService = {
|
||||
@@ -27,4 +28,12 @@ export const providerSkillsService = {
|
||||
const provider = providerRegistry.resolveProvider(providerName);
|
||||
return provider.skills.addSkills(input);
|
||||
},
|
||||
|
||||
async removeProviderSkill(
|
||||
providerName: string,
|
||||
input: ProviderSkillRemoveInput,
|
||||
): Promise<{ removed: boolean; provider: string; directoryName: string }> {
|
||||
const provider = providerRegistry.resolveProvider(providerName);
|
||||
return provider.skills.removeSkill(input);
|
||||
},
|
||||
};
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import path from 'node:path';
|
||||
import { mkdir, rm, writeFile } from 'node:fs/promises';
|
||||
import { mkdir, rm, stat, writeFile } from 'node:fs/promises';
|
||||
|
||||
import type { IProviderSkills } from '@/shared/interfaces.js';
|
||||
import type {
|
||||
LLMProvider,
|
||||
ProviderSkillCreateInput,
|
||||
ProviderSkillRemoveInput,
|
||||
ProviderSkill,
|
||||
ProviderSkillListOptions,
|
||||
ProviderSkillSource,
|
||||
@@ -236,6 +237,48 @@ export abstract class SkillsProvider implements IProviderSkills {
|
||||
return pendingInstalls.map((install) => install.skill);
|
||||
}
|
||||
|
||||
async removeSkill(
|
||||
input: ProviderSkillRemoveInput,
|
||||
): Promise<{ removed: boolean; provider: LLMProvider; directoryName: string }> {
|
||||
const globalSkillSource = await this.getGlobalSkillSource();
|
||||
if (!globalSkillSource) {
|
||||
throw new AppError(`${this.provider} does not support managed global skills.`, {
|
||||
code: 'PROVIDER_SKILLS_WRITE_UNSUPPORTED',
|
||||
statusCode: 400,
|
||||
});
|
||||
}
|
||||
|
||||
const directoryName = normalizeSkillDirectoryName(input.directoryName);
|
||||
if (!directoryName) {
|
||||
throw new AppError('Skill directoryName is required.', {
|
||||
code: 'PROVIDER_SKILL_DIRECTORY_REQUIRED',
|
||||
statusCode: 400,
|
||||
});
|
||||
}
|
||||
|
||||
const skillDirectoryPath = path.join(globalSkillSource.rootDir, directoryName);
|
||||
const resolvedRoot = path.resolve(globalSkillSource.rootDir);
|
||||
const resolvedSkillDirectoryPath = path.resolve(skillDirectoryPath);
|
||||
if (
|
||||
resolvedSkillDirectoryPath !== resolvedRoot
|
||||
&& !resolvedSkillDirectoryPath.startsWith(`${resolvedRoot}${path.sep}`)
|
||||
) {
|
||||
throw new AppError('Skill directory must stay inside the managed skill root.', {
|
||||
code: 'PROVIDER_SKILL_DIRECTORY_INVALID',
|
||||
statusCode: 400,
|
||||
});
|
||||
}
|
||||
|
||||
const removed = await stat(resolvedSkillDirectoryPath)
|
||||
.then((stats) => stats.isDirectory())
|
||||
.catch(() => false);
|
||||
if (removed) {
|
||||
await rm(resolvedSkillDirectoryPath, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
return { removed, provider: this.provider, directoryName };
|
||||
}
|
||||
|
||||
protected abstract getSkillSources(workspacePath: string): Promise<ProviderSkillSource[]>;
|
||||
|
||||
protected async getGlobalSkillSource(): Promise<ProviderSkillSource | null> {
|
||||
|
||||
@@ -662,6 +662,19 @@ test('providerSkillsService adds global skills for claude, codex, gemini, and cu
|
||||
const listedCursorSkills = await providerSkillsService.listProviderSkills('cursor');
|
||||
assert.equal(listedCursorSkills.some((skill) => skill.name === 'cursor-global'), true);
|
||||
|
||||
const removedCodexSkill = await providerSkillsService.removeProviderSkill('codex', {
|
||||
directoryName: 'uploaded-codex-folder',
|
||||
});
|
||||
assert.equal(removedCodexSkill.removed, true);
|
||||
assert.equal(removedCodexSkill.provider, 'codex');
|
||||
assert.equal(removedCodexSkill.directoryName, 'uploaded-codex-folder');
|
||||
await assert.rejects(fs.stat(path.dirname(createdCodexSkill.sourcePath)), { code: 'ENOENT' });
|
||||
|
||||
const removedMissingSkill = await providerSkillsService.removeProviderSkill('codex', {
|
||||
directoryName: 'uploaded-codex-folder',
|
||||
});
|
||||
assert.equal(removedMissingSkill.removed, false);
|
||||
|
||||
await assert.rejects(
|
||||
providerSkillsService.addProviderSkills('codex', {
|
||||
entries: [
|
||||
@@ -701,4 +714,11 @@ test('providerSkillsService rejects managed skill creation for opencode', { conc
|
||||
}),
|
||||
/does not support managed global skills/i,
|
||||
);
|
||||
|
||||
await assert.rejects(
|
||||
providerSkillsService.removeProviderSkill('opencode', {
|
||||
directoryName: 'opencode-global-dir',
|
||||
}),
|
||||
/does not support managed global skills/i,
|
||||
);
|
||||
});
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
import type { WebSocket } from 'ws';
|
||||
|
||||
import { desktopAgentRelay } from '@/modules/computer-use/index.js';
|
||||
import type { AuthenticatedWebSocketRequest } from '@/shared/types.js';
|
||||
import { parseIncomingJsonObject } from '@/shared/utils.js';
|
||||
|
||||
/**
|
||||
* Handles the `/desktop-agent` websocket — the inbound side of the cloud
|
||||
* Computer Use relay. A linked CloudCLI desktop app connects here and registers
|
||||
* itself as the executor for this hosted environment. The server then forwards
|
||||
* `computer_*` actions via `desktopAgentRelay`, and the agent returns results as
|
||||
* `computer_relay_result` frames correlated by `id`.
|
||||
*/
|
||||
export function handleDesktopAgentConnection(
|
||||
ws: WebSocket,
|
||||
request: AuthenticatedWebSocketRequest
|
||||
): void {
|
||||
let registered = false;
|
||||
|
||||
ws.on('message', (rawMessage) => {
|
||||
const data = parseIncomingJsonObject(rawMessage);
|
||||
if (!data) {
|
||||
return;
|
||||
}
|
||||
const kind = typeof data.kind === 'string' ? data.kind : typeof data.type === 'string' ? data.type : '';
|
||||
if (kind === 'register' && !registered) {
|
||||
const label = typeof data.label === 'string' && data.label.trim()
|
||||
? data.label.trim()
|
||||
: request.user?.username
|
||||
? `desktop:${request.user.username}`
|
||||
: 'desktop-agent';
|
||||
registered = true;
|
||||
console.log('[INFO] Desktop agent websocket registered:', label);
|
||||
desktopAgentRelay.register(ws, label);
|
||||
return;
|
||||
}
|
||||
if (kind === 'computer_relay_result' && typeof data.id === 'string') {
|
||||
desktopAgentRelay.handleResult(
|
||||
data.id,
|
||||
(data as Record<string, unknown>).result,
|
||||
typeof (data as Record<string, unknown>).error === 'string'
|
||||
? ((data as Record<string, unknown>).error as string)
|
||||
: undefined
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
ws.on('close', () => {
|
||||
console.log('[INFO] Desktop agent websocket disconnected');
|
||||
});
|
||||
}
|
||||
@@ -6,6 +6,8 @@ import { handleChatConnection } from '@/modules/websocket/services/chat-websocke
|
||||
import { verifyWebSocketClient } from '@/modules/websocket/services/websocket-auth.service.js';
|
||||
import { handlePluginWsProxy } from '@/modules/websocket/services/plugin-websocket-proxy.service.js';
|
||||
import { handleShellConnection } from '@/modules/websocket/services/shell-websocket.service.js';
|
||||
import { handleDesktopAgentConnection } from '@/modules/websocket/services/desktop-agent-websocket.service.js';
|
||||
import { handleDesktopNotificationsConnection } from '@/modules/notifications/index.js';
|
||||
import type { AuthenticatedWebSocketRequest } from '@/shared/types.js';
|
||||
|
||||
type WebSocketServerDependencies = {
|
||||
@@ -63,6 +65,16 @@ export function createWebSocketServer(
|
||||
return;
|
||||
}
|
||||
|
||||
if (pathname === '/desktop-agent') {
|
||||
handleDesktopAgentConnection(ws, incomingRequest);
|
||||
return;
|
||||
}
|
||||
|
||||
if (pathname === '/desktop-notifications') {
|
||||
handleDesktopNotificationsConnection(ws, incomingRequest);
|
||||
return;
|
||||
}
|
||||
|
||||
if (pathname.startsWith('/plugin-ws/')) {
|
||||
handlePluginWsProxy(ws, pathname, dependencies.getPluginPort);
|
||||
return;
|
||||
|
||||
Reference in New Issue
Block a user