feat: add CloudCLI computer use semantics, desktop helper packaging, and permission onboarding

This commit is contained in:
Simos Mikelatos
2026-06-19 12:09:55 +00:00
parent a35200f340
commit 1726705459
37 changed files with 3036 additions and 426 deletions

View File

@@ -38,7 +38,7 @@ jobs:
run: |
SAFE_REF="$(printf '%s' "${GITHUB_REF_NAME}" | tr -c 'A-Za-z0-9._-' '-')"
echo "name=CloudCLI-macOS-${SAFE_REF}-${GITHUB_RUN_NUMBER}" >> "$GITHUB_OUTPUT"
echo "server_bundle_tag=desktop-server-${SAFE_REF}" >> "$GITHUB_OUTPUT"
echo "server_bundle_tag=cloudcli-local-server-${SAFE_REF}" >> "$GITHUB_OUTPUT"
- name: Configure branch server bundle source
run: printf '{"releaseTag":"%s"}\n' "${{ steps.artifact.outputs.server_bundle_tag }}" > electron/server-bundle-config.json
@@ -60,6 +60,7 @@ jobs:
- name: Build signed and notarized macOS artifacts
run: npm run desktop:dist:mac -- --publish never
env:
CLOUDCLI_SEMANTICS_BUILD_REQUIRED: "1"
CSC_LINK: ${{ secrets.CSC_LINK }}
CSC_KEY_PASSWORD: ${{ secrets.CSC_KEY_PASSWORD }}
APPLE_ID: ${{ secrets.APPLE_ID }}
@@ -69,22 +70,31 @@ jobs:
- name: Build branch server bundle
run: node scripts/release/build-server-bundle.js
- name: Verify branch server runtime artifacts
run: |
test -n "$(find release/local-server -maxdepth 1 -name 'cloudcli-local-server-*.tar.gz' -print -quit)"
test -n "$(find release/local-server -maxdepth 1 -name 'cloudcli-local-server-*.tar.gz.sha256' -print -quit)"
- name: Publish branch server bundle
uses: softprops/action-gh-release@3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 # v2
with:
tag_name: ${{ steps.artifact.outputs.server_bundle_tag }}
name: CloudCLI Desktop Server Bundle (${{ github.ref_name }})
name: CloudCLI Internal Local Runtime (${{ github.ref_name }})
body: |
Internal runtime assets for CloudCLI Desktop branch builds.
Users should download the desktop app from the workflow artifact. The desktop app downloads these runtime bundles automatically when local mode is enabled.
prerelease: true
fail_on_unmatched_files: false
overwrite_files: true
files: |
release/server-bundles/*
release/local-server/*
- name: Verify macOS artifacts
run: |
test -n "$(find release -maxdepth 1 -name '*.dmg' -print -quit)"
test -n "$(find release -maxdepth 1 -name '*.zip' -print -quit)"
shasum -a 256 release/*.{dmg,zip} > release/SHASUMS256.txt
test -n "$(find release/desktop -maxdepth 1 -name '*.dmg' -print -quit)"
test -n "$(find release/desktop -maxdepth 1 -name '*.zip' -print -quit)"
shasum -a 256 release/desktop/*.{dmg,zip} > release/SHASUMS256.txt
cat release/SHASUMS256.txt
- name: Upload branch build artifacts
@@ -92,10 +102,10 @@ jobs:
with:
name: ${{ steps.artifact.outputs.name }}
path: |
release/*.dmg
release/*.zip
release/*.yml
release/*.blockmap
release/desktop/*.dmg
release/desktop/*.zip
release/desktop/*.yml
release/desktop/*.blockmap
release/SHASUMS256.txt
if-no-files-found: error
retention-days: 14

View File

@@ -61,6 +61,10 @@ jobs:
echo "tag=$TAG" >> "$GITHUB_OUTPUT"
echo "release_name=$RELEASE_NAME" >> "$GITHUB_OUTPUT"
echo "server_bundle_tag=cloudcli-local-server-${TAG}" >> "$GITHUB_OUTPUT"
- name: Configure release server bundle source
run: printf '{"releaseTag":"%s"}\n' "${{ steps.release.outputs.server_bundle_tag }}" > electron/server-bundle-config.json
- name: Verify signing secrets are configured
run: |
@@ -79,6 +83,7 @@ jobs:
- name: Build signed and notarized macOS artifacts
run: npm run desktop:dist:mac -- --publish never
env:
CLOUDCLI_SEMANTICS_BUILD_REQUIRED: "1"
CSC_LINK: ${{ secrets.CSC_LINK }}
CSC_KEY_PASSWORD: ${{ secrets.CSC_KEY_PASSWORD }}
APPLE_ID: ${{ secrets.APPLE_ID }}
@@ -88,12 +93,32 @@ jobs:
- name: Build local server bundle
run: node scripts/release/build-server-bundle.js
- name: Verify local server runtime artifacts
run: |
test -n "$(find release/local-server -maxdepth 1 -name 'cloudcli-local-server-*.tar.gz' -print -quit)"
test -n "$(find release/local-server -maxdepth 1 -name 'cloudcli-local-server-*.tar.gz.sha256' -print -quit)"
- name: Publish local server runtime assets
uses: softprops/action-gh-release@3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 # v2
with:
tag_name: ${{ steps.release.outputs.server_bundle_tag }}
target_commitish: ${{ github.sha }}
name: CloudCLI Local Server Runtime (${{ steps.release.outputs.tag }})
body: |
Internal runtime assets for CloudCLI Desktop local mode.
Users should download CloudCLI Desktop from the main ${{ steps.release.outputs.tag }} release. The desktop app downloads these runtime bundles automatically when local mode is enabled.
prerelease: true
fail_on_unmatched_files: false
overwrite_files: true
files: |
release/local-server/*
- name: Verify macOS artifacts
run: |
test -n "$(find release -maxdepth 1 -name '*.dmg' -print -quit)"
test -n "$(find release -maxdepth 1 -name '*.zip' -print -quit)"
test -n "$(find release/server-bundles -maxdepth 1 -name 'cloudcli-server-*.tar.gz' -print -quit)"
shasum -a 256 release/*.{dmg,zip} release/server-bundles/* > release/SHASUMS256.txt
test -n "$(find release/desktop -maxdepth 1 -name '*.dmg' -print -quit)"
test -n "$(find release/desktop -maxdepth 1 -name '*.zip' -print -quit)"
shasum -a 256 release/desktop/*.{dmg,zip} > release/SHASUMS256.txt
cat release/SHASUMS256.txt
- name: Publish GitHub release assets
@@ -102,12 +127,15 @@ jobs:
tag_name: ${{ steps.release.outputs.tag }}
target_commitish: ${{ github.sha }}
name: ${{ steps.release.outputs.release_name }}
body: |
Download the CloudCLI Desktop installer for your Mac.
The local server runtime used by local mode is installed automatically by the desktop app. You do not need to download any server bundle manually.
prerelease: ${{ inputs.prerelease }}
fail_on_unmatched_files: false
files: |
release/*.dmg
release/*.zip
release/*.yml
release/*.blockmap
release/server-bundles/*
release/desktop/*.dmg
release/desktop/*.zip
release/desktop/*.yml
release/desktop/*.blockmap
release/SHASUMS256.txt

View File

@@ -39,7 +39,7 @@ jobs:
run: |
SAFE_REF="$(printf '%s' "${GITHUB_REF_NAME}" | tr -c 'A-Za-z0-9._-' '-')"
echo "name=CloudCLI-windows-${SAFE_REF}-${GITHUB_RUN_NUMBER}" >> "$GITHUB_OUTPUT"
echo "server_bundle_tag=desktop-server-${SAFE_REF}" >> "$GITHUB_OUTPUT"
echo "server_bundle_tag=cloudcli-local-server-${SAFE_REF}" >> "$GITHUB_OUTPUT"
- name: Configure branch server bundle source
shell: bash
@@ -48,28 +48,39 @@ jobs:
- name: Build unsigned Windows artifacts
run: npm run desktop:dist:win -- --publish never
env:
CLOUDCLI_SEMANTICS_BUILD_REQUIRED: "1"
CSC_IDENTITY_AUTO_DISCOVERY: "false"
- name: Build branch server bundle
run: node scripts/release/build-server-bundle.js
- name: Verify branch server runtime artifacts
shell: bash
run: |
test -n "$(find release/local-server -maxdepth 1 -name 'cloudcli-local-server-*.tar.gz' -print -quit)"
test -n "$(find release/local-server -maxdepth 1 -name 'cloudcli-local-server-*.tar.gz.sha256' -print -quit)"
- name: Publish branch server bundle
uses: softprops/action-gh-release@3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 # v2
with:
tag_name: ${{ steps.artifact.outputs.server_bundle_tag }}
name: CloudCLI Desktop Server Bundle (${{ github.ref_name }})
name: CloudCLI Internal Local Runtime (${{ github.ref_name }})
body: |
Internal runtime assets for CloudCLI Desktop branch builds.
Users should download the desktop app from the workflow artifact. The desktop app downloads these runtime bundles automatically when local mode is enabled.
prerelease: true
fail_on_unmatched_files: false
overwrite_files: true
files: |
release/server-bundles/*
release/local-server/*
- name: Verify Windows artifacts
shell: bash
run: |
test -n "$(find release -maxdepth 1 -name '*.exe' -print -quit)"
test -n "$(find release -maxdepth 1 -name '*.zip' -print -quit)"
sha256sum release/*.{exe,zip} > release/SHASUMS256.txt
test -n "$(find release/desktop -maxdepth 1 -name '*.exe' -print -quit)"
test -n "$(find release/desktop -maxdepth 1 -name '*.zip' -print -quit)"
sha256sum release/desktop/*.{exe,zip} > release/SHASUMS256.txt
cat release/SHASUMS256.txt
- name: Upload branch build artifacts
@@ -77,10 +88,10 @@ jobs:
with:
name: ${{ steps.artifact.outputs.name }}
path: |
release/*.exe
release/*.zip
release/*.yml
release/*.blockmap
release/desktop/*.exe
release/desktop/*.zip
release/desktop/*.yml
release/desktop/*.blockmap
release/SHASUMS256.txt
if-no-files-found: error
retention-days: 14

View File

@@ -14,7 +14,74 @@ on:
type: string
jobs:
build-macos-semantic-helper:
strategy:
fail-fast: false
matrix:
include:
- runs_on: macos-15
target_dir: darwin-arm64
- runs_on: macos-15-intel
target_dir: darwin-x64
runs-on: ${{ matrix.runs_on }}
steps:
- uses: actions/checkout@v6
- uses: actions/setup-node@v6
with:
node-version: 22
- name: Build macOS semantic helper
run: node scripts/build-computer-semantics.mjs
env:
CLOUDCLI_SEMANTICS_BUILD_REQUIRED: "1"
- name: Verify macOS semantic helper target
run: test -x "server/modules/computer-use/semantics/bin/${{ matrix.target_dir }}/CloudCLISemantics"
- name: Stage macOS semantic helper artifact
run: |
mkdir -p "semantic-helper-artifact/${{ matrix.target_dir }}"
cp "server/modules/computer-use/semantics/bin/${{ matrix.target_dir }}/CloudCLISemantics" "semantic-helper-artifact/${{ matrix.target_dir }}/"
- uses: actions/upload-artifact@v6
with:
name: semantic-helper-${{ matrix.target_dir }}
path: semantic-helper-artifact/*
if-no-files-found: error
build-windows-semantic-helper:
strategy:
fail-fast: false
matrix:
include:
- runs_on: windows-2025
target_dir: win32-x64
- runs_on: windows-11-arm
target_dir: win32-arm64
runs-on: ${{ matrix.runs_on }}
steps:
- uses: actions/checkout@v6
- uses: actions/setup-node@v6
with:
node-version: 22
- name: Build Windows semantic helper
run: node scripts/build-computer-semantics.mjs
env:
CLOUDCLI_SEMANTICS_BUILD_REQUIRED: "1"
- name: Verify Windows semantic helper target
shell: bash
run: test -f "server/modules/computer-use/semantics/bin/${{ matrix.target_dir }}/CloudCLISemantics.exe"
- name: Stage Windows semantic helper artifact
shell: bash
run: |
mkdir -p "semantic-helper-artifact/${{ matrix.target_dir }}"
cp "server/modules/computer-use/semantics/bin/${{ matrix.target_dir }}/CloudCLISemantics.exe" "semantic-helper-artifact/${{ matrix.target_dir }}/"
- uses: actions/upload-artifact@v6
with:
name: semantic-helper-${{ matrix.target_dir }}
path: semantic-helper-artifact/*
if-no-files-found: error
release:
needs:
- build-macos-semantic-helper
- build-windows-semantic-helper
runs-on: ubuntu-latest
permissions:
contents: write
@@ -37,6 +104,20 @@ jobs:
- run: npm ci
- uses: actions/download-artifact@v6
with:
pattern: semantic-helper-*
path: server/modules/computer-use/semantics/bin
merge-multiple: true
- name: Verify bundled semantic helpers
run: |
test -x server/modules/computer-use/semantics/bin/darwin-arm64/CloudCLISemantics
test -x server/modules/computer-use/semantics/bin/darwin-x64/CloudCLISemantics
test -f server/modules/computer-use/semantics/bin/win32-x64/CloudCLISemantics.exe
test -f server/modules/computer-use/semantics/bin/win32-arm64/CloudCLISemantics.exe
find server/modules/computer-use/semantics/bin -maxdepth 2 -type f -print
- name: Release
run: |
ARGS="--ci --increment=${{ inputs.increment }}"

View File

@@ -421,6 +421,50 @@ svg {
gap: 12px;
}
/* Card that wraps the Computer Use permission note and rows:
   a single-column grid with a subtle border and rounded corners. */
.cc-permissions {
display: grid;
gap: 10px;
padding: 12px;
border: 1px solid var(--b-subtle);
border-radius: 12px;
background: var(--s1);
}
/* Small secondary-colour explanatory text (used for the permission status message). */
.cc-note {
color: var(--tx2);
font-size: 12px;
line-height: 1.45;
}
/* One permission entry: a flexible text column plus an auto-sized actions column,
   separated from the content above by a top border. */
.cc-permission-row {
display: grid;
/* minmax(0, 1fr) lets the text column shrink instead of overflowing the card. */
grid-template-columns: minmax(0, 1fr) auto;
gap: 12px;
align-items: center;
padding-top: 10px;
border-top: 1px solid var(--b-subtle);
}
/* Permission name, rendered in the primary text colour at semi-bold weight. */
.cc-permission-title {
color: var(--tx);
font-size: 13px;
font-weight: 600;
}
/* One-line explanation shown directly beneath the permission title. */
.cc-permission-detail {
margin-top: 2px;
color: var(--tx2);
font-size: 12px;
line-height: 1.4;
}
/* Right-aligned holder for the status badge and optional button; wraps when space is tight. */
.cc-permission-actions {
display: flex;
flex-wrap: wrap;
justify-content: flex-end;
gap: 8px;
}
.cc-kv {
display: flex;
align-items: center;

View File

@@ -9,6 +9,13 @@ window.__MOCK_STATE__ = {
localServerRunning: false,
localStartupLogs: [],
computerUse: { enabled: false, consentMode: 'ask', running: false, connectedCount: 0, targetCount: 0 },
computerUsePermissions: {
platform: 'darwin',
supported: true,
accessibility: 'not_granted',
screenRecording: 'not_determined',
message: 'macOS requires Accessibility and Screen Recording for Computer Use.',
},
environments: [
{ id: 'env-api', name: 'api-gateway', subdomain: 'api-gateway', access_url: 'https://api-gateway.cloudcli.ai', status: 'running', region: 'fra1', agent: 'Claude Code' },
{ id: 'env-web', name: 'web-frontend', subdomain: 'web-frontend', access_url: 'https://web-frontend.cloudcli.ai', status: 'stopped', region: 'sfo1', agent: 'Codex' },
@@ -73,6 +80,16 @@ window.__MOCK_STATE__ = {
mockState.computerUse.running = mockState.computerUse.enabled;
return Promise.resolve(clone(mockState));
},
requestComputerUsePermission: function (permission) {
mockState.computerUsePermissions = mockState.computerUsePermissions || {};
if (permission === 'accessibility') mockState.computerUsePermissions.accessibility = 'granted';
if (permission === 'screen') mockState.computerUsePermissions.screenRecording = 'granted';
if (permission === 'all') {
mockState.computerUsePermissions.accessibility = 'granted';
mockState.computerUsePermissions.screenRecording = 'granted';
}
return Promise.resolve(clone(mockState));
},
openEnvironment: function (id) {
var env = (mockState.environments || []).filter(function (item) { return item.id === id; })[0];
if (env) {
@@ -333,6 +350,10 @@ window.__MOCK_STATE__ = {
consentMode: current.consentMode === 'auto' ? 'auto' : 'ask',
});
});
case 'computer-permission':
return CC.run('Opening permission settings...', function () {
return bridge.requestComputerUsePermission(node.getAttribute('data-cc-computer-permission'));
});
case 'settings-toggle':
return CC.run('Opening desktop settings...', function () { return bridge.showDesktopSettings(); });
case 'desktop-settings-toggle':
@@ -480,12 +501,47 @@ window.__MOCK_STATE__ = {
);
};
/**
 * Translates a raw permission status into the text shown on its badge.
 *
 * @param {string} value Permission status reported in the desktop state.
 * @returns {string} 'Granted', 'Needs attention', 'Not required', or
 *   'Not granted' for every other status.
 */
function permissionLabel(value) {
  switch (value) {
    case 'granted':
      return 'Granted';
    case 'denied':
    case 'restricted':
      return 'Needs attention';
    case 'not_applicable':
      return 'Not required';
    default:
      return 'Not granted';
  }
}
/**
 * Picks the badge tone (used as a CSS class suffix) for a permission status.
 *
 * @param {string} value Permission status reported in the desktop state.
 * @returns {string} 'ok' when nothing is left to do, 'warn' when access was
 *   denied or restricted, and 'idle' for every other status.
 */
function permissionTone(value) {
  switch (value) {
    case 'granted':
    case 'not_applicable':
      return 'ok';
    case 'denied':
    case 'restricted':
      return 'warn';
    default:
      return 'idle';
  }
}
/**
 * Builds the HTML for a single permission row: title and detail on the left,
 * a status badge on the right, plus an "Open settings" button unless the
 * permission is already granted or not applicable.
 *
 * @param {string} key Permission id placed in data-cc-computer-permission.
 * @param {string} label Human-readable permission name.
 * @param {string} detail Short explanation shown under the name.
 * @param {string} status Current permission status.
 * @returns {string} HTML markup for the row.
 */
function renderComputerPermissionRow(key, label, detail, status) {
  var description =
    '<div><div class="cc-permission-title">' + CC.esc(label) + '</div>' +
    '<div class="cc-permission-detail">' + CC.esc(detail) + '</div></div>';
  var badge = '<span class="badge ' + permissionTone(status) + '">' + CC.esc(permissionLabel(status)) + '</span>';
  var settled = status === 'granted' || status === 'not_applicable';
  var button = settled
    ? ''
    : '<button class="btn sm" data-cc-action="computer-permission" data-cc-computer-permission="' + CC.esc(key) + '">Open settings</button>';

  return '<div class="cc-permission-row">' +
    description +
    '<div class="cc-permission-actions">' + badge + button + '</div>' +
    '</div>';
}
/**
 * Renders the permission section body: a status note, followed by the
 * Accessibility and Screen Recording rows when permission onboarding is
 * supported on the current platform.
 *
 * @param {object} state Desktop state; reads state.computerUsePermissions.
 * @returns {string} HTML markup.
 */
function renderComputerPermissions(state) {
  var permissions = state.computerUsePermissions || {};
  var fallbackNote = permissions.supported
    ? 'Grant the required OS permissions before approving agent control.'
    : 'No additional OS permission setup is required from CloudCLI on this platform.';
  var note = '<div class="cc-note">' + CC.esc(permissions.message || fallbackNote) + '</div>';

  if (!permissions.supported) {
    return note;
  }

  var rows = [
    renderComputerPermissionRow('accessibility', 'Accessibility', 'Allows CloudCLI to click, type, and use accessibility actions.', permissions.accessibility),
    renderComputerPermissionRow('screen', 'Screen Recording', 'Allows CloudCLI to capture screenshots for agent observation.', permissions.screenRecording),
  ];
  return note + rows.join('');
}
CC.buildComputerUseSection = function (state) {
var computerUse = state.computerUse || {};
var body =
'<div class="cc-surface">' +
'<label class="cc-toggle"><input type="checkbox" data-cc-computer-enabled="true"' + (computerUse.enabled ? ' checked' : '') + '><span><b>Enable Computer Use</b><br>Let CloudCLI use the computer. Agents cannot act until you approve a session.</span></label>';
if (computerUse.enabled) {
body += '<div class="cc-permissions">' + renderComputerPermissions(state) + '</div>';
body += '<div class="cc-choice-group">' +
CC.renderRadioOption('computer-access-mode', 'ask', computerUse.consentMode !== 'auto', 'Ask before each session', 'Agents can request control, but you approve every session.') +
CC.renderRadioOption('computer-access-mode', 'auto', computerUse.consentMode === 'auto', 'Unattended access', 'Trusted agents can use this computer without a local approval prompt.') +

View File

@@ -1,4 +1,4 @@
import { app, BrowserWindow, clipboard, dialog, ipcMain, shell } from 'electron';
import { app, BrowserWindow, clipboard, dialog, ipcMain, shell, systemPreferences } from 'electron';
import { spawn } from 'node:child_process';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
@@ -141,9 +141,64 @@ function getDesktopState() {
activeTabId: tabs.activeTabId,
environments: cloud.getEnvironments().map(serializeEnvironment),
computerUse: computerAgent?.getState() || { enabled: false, consentMode: 'ask', running: false, connectedCount: 0, targetCount: 0 },
computerUsePermissions: getComputerUsePermissions(),
};
}
/**
 * Reports the OS-level permission status relevant to Computer Use.
 *
 * On non-macOS platforms it returns a fixed "not applicable" record. On macOS
 * it queries Accessibility trust (without prompting) and the Screen Recording
 * media-access status; a query that throws is reported as 'unknown'.
 *
 * @returns {{platform: string, supported: boolean, accessibility: string,
 *   screenRecording: string, message: string}}
 */
function getComputerUsePermissions() {
  const { platform } = process;
  if (platform !== 'darwin') {
    return {
      platform,
      supported: false,
      accessibility: 'not_applicable',
      screenRecording: 'not_applicable',
      message: 'No OS permission onboarding is required from CloudCLI on this platform.',
    };
  }

  // Each status query is best-effort: any failure degrades to 'unknown'.
  const probe = (query) => {
    try {
      return query();
    } catch {
      return 'unknown';
    }
  };

  const accessibility = probe(() => (
    // `false` = only check trust; do not show the system prompt.
    systemPreferences.isTrustedAccessibilityClient(false) ? 'granted' : 'not_granted'
  ));
  // The value is passed through exactly as Electron reports it.
  const screenRecording = probe(() => systemPreferences.getMediaAccessStatus('screen'));
  const everythingGranted = accessibility === 'granted' && screenRecording === 'granted';

  return {
    platform: 'darwin',
    supported: true,
    accessibility,
    screenRecording,
    message: everythingGranted
      ? 'macOS permissions are granted.'
      : 'macOS requires Accessibility and Screen Recording for Computer Use.',
  };
}
/**
 * Starts the macOS permission flow for the requested Computer Use permission
 * and returns the refreshed desktop state. On other platforms nothing is
 * requested and the current state is returned unchanged.
 *
 * @param {string} permission 'accessibility', 'screen', or 'all'.
 * @returns {Promise<object>} Result of getDesktopState().
 * @throws {Error} On macOS, when `permission` is not one of the known values.
 */
async function requestComputerUsePermission(permission) {
  if (process.platform !== 'darwin') {
    return getDesktopState();
  }

  // `true` asks macOS to show its Accessibility trust prompt.
  const promptAccessibility = () => systemPreferences.isTrustedAccessibilityClient(true);
  // Screen Recording is handled by opening the matching System Settings pane.
  const openScreenRecordingPane = () => shell.openExternal(
    'x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture',
  );

  switch (permission) {
    case 'accessibility':
      promptAccessibility();
      break;
    case 'screen':
      await openScreenRecordingPane();
      break;
    case 'all':
      promptAccessibility();
      await openScreenRecordingPane();
      break;
    default:
      throw new Error(`Unknown Computer Use permission: ${permission}`);
  }

  return getDesktopState();
}
async function openExternalUrl(url) {
if (String(url).startsWith(`${CALLBACK_PROTOCOL}://`)) {
await handleDeepLink(url);
@@ -678,6 +733,7 @@ function registerIpcHandlers() {
return getDesktopState();
});
ipcMain.handle('cloudcli-desktop:update-computer-use', async (_event, settings) => updateComputerUse(settings));
ipcMain.handle('cloudcli-desktop:request-computer-use-permission', async (_event, permission) => requestComputerUsePermission(permission));
ipcMain.handle('cloudcli-desktop:show-desktop-settings', async () => desktopWindow.showDesktopSettings());
ipcMain.handle('cloudcli-desktop:show-local-settings', async () => desktopWindow.showLocalSettings());
ipcMain.handle('cloudcli-desktop:close-settings-window', async () => {

View File

@@ -17,6 +17,7 @@ if (window.location.protocol === 'file:') {
showComputerAccess: () => ipcRenderer.invoke('cloudcli-desktop:show-computer-access'),
showLocalSettings: () => ipcRenderer.invoke('cloudcli-desktop:show-local-settings'),
updateComputerUse: (settings) => ipcRenderer.invoke('cloudcli-desktop:update-computer-use', settings),
requestComputerUsePermission: (permission) => ipcRenderer.invoke('cloudcli-desktop:request-computer-use-permission', permission),
showDesktopSettings: () => ipcRenderer.invoke('cloudcli-desktop:show-desktop-settings'),
closeSettingsWindow: () => ipcRenderer.invoke('cloudcli-desktop:close-settings-window'),
showActiveEnvironmentActionsMenu: () => ipcRenderer.invoke('cloudcli-desktop:show-active-environment-actions-menu'),

View File

@@ -59,7 +59,7 @@ export class ServerInstaller {
}
getBundleName() {
return `cloudcli-server-${this.version}-${this.platform}-${this.arch}.tar.gz`;
return `cloudcli-local-server-${this.version}-${this.platform}-${this.arch}.tar.gz`;
}
getBundleUrl() {

View File

@@ -29,10 +29,13 @@
"scripts": {
"dev": "concurrently --kill-others \"npm run server:dev\" \"npm run client\"",
"server": "node dist-server/server/index.js",
"preserver:dev": "npm run build:semantics",
"server:dev": "tsx --tsconfig server/tsconfig.json server/index.js",
"preserver:dev-watch": "npm run build:semantics",
"server:dev-watch": "tsx watch --tsconfig server/tsconfig.json server/index.js",
"client": "vite",
"desktop": "electron electron/main.js",
"predesktop:dev": "npm run build:semantics",
"desktop:dev": "cross-env ELECTRON_DEV_URL=http://127.0.0.1:5173 electron electron/main.js",
"desktop:stage": "node scripts/release/prepare-desktop-app.js",
"desktop:pack": "npm run build && npm run desktop:stage && electron-builder --projectDir .desktop-build/desktop-app --dir",
@@ -40,10 +43,12 @@
"desktop:dist:win": "npm run build && npm run desktop:stage && electron-builder --projectDir .desktop-build/desktop-app --win nsis zip",
"server:bundle": "npm run build && node scripts/release/build-server-bundle.js",
"desktop:icon:mac": "node electron/scripts/generate-macos-icon.js",
"build": "npm run build:client && npm run build:server",
"build": "npm run build:semantics && npm run build:client && npm run build:server",
"build:client": "vite build",
"build:semantics": "node scripts/build-computer-semantics.mjs",
"prebuild:server": "node -e \"require('node:fs').rmSync('dist-server', { recursive: true, force: true })\"",
"build:server": "tsc -p server/tsconfig.json && tsc-alias -p server/tsconfig.json",
"postbuild:server": "node scripts/copy-computer-semantics-bin.mjs",
"preview": "vite preview",
"typecheck": "tsc --noEmit -p tsconfig.json && tsc --noEmit -p server/tsconfig.json",
"lint": "eslint src/ server/",
@@ -51,7 +56,7 @@
"start": "npm run build && npm run server",
"release": "./release.sh",
"prepublishOnly": "npm run build",
"postinstall": "node scripts/fix-node-pty.js",
"postinstall": "node scripts/fix-node-pty.js && npm run build:semantics",
"prepare": "husky",
"update:platform": "./update-platform.sh"
},
@@ -59,9 +64,9 @@
"appId": "ai.cloudcli.desktop",
"productName": "CloudCLI",
"asar": false,
"artifactName": "CloudCLI-${version}-${arch}.${ext}",
"artifactName": "cloudcli-desktop-${version}-${os}-${arch}.${ext}",
"directories": {
"output": "release"
"output": "release/desktop"
},
"extraMetadata": {
"main": "electron/main.js"

View File

@@ -0,0 +1,133 @@
#!/usr/bin/env node
import { spawn } from 'node:child_process';
import fs from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Repository root: one directory above the directory holding this script.
const rootDir = path.resolve(__dirname, '..');
// Target platform/arch default to the running process and can be overridden
// through CLOUDCLI_SEMANTICS_PLATFORM / CLOUDCLI_SEMANTICS_ARCH.
const platform = process.env.CLOUDCLI_SEMANTICS_PLATFORM || process.platform;
const arch = process.env.CLOUDCLI_SEMANTICS_ARCH || process.arch;
const platformArch = `${platform}-${arch}`;
const semanticsRoot = path.join(rootDir, 'server', 'modules', 'computer-use', 'semantics');
// Built helpers are written to semantics/bin/<platform>-<arch>/.
const outDir = path.join(semanticsRoot, 'bin', platformArch);
// Only the exact value "1" makes a missing toolchain a hard error (see ensureCommand).
const requireBuild = process.env.CLOUDCLI_SEMANTICS_BUILD_REQUIRED === '1';
/**
 * Spawns a command with inherited stdio and settles when it finishes.
 *
 * @param {string} command Executable to run.
 * @param {string[]} args Arguments passed to the executable.
 * @param {object} [options] Extra spawn options; they override the defaults.
 * @returns {Promise<void>} Resolves on exit code 0. Rejects if the process
 *   cannot be spawned, exits with a non-zero code, or is killed by a signal.
 */
function run(command, args, options = {}) {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, {
      stdio: 'inherit',
      // A shell is only used on Windows.
      shell: process.platform === 'win32',
      ...options,
    });
    child.once('error', reject);
    child.once('exit', (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }
      // When a signal terminates the child, `code` is null and `signal` names
      // the cause; report that instead of the misleading "code null".
      const outcome = code === null
        ? `was terminated by signal ${signal}`
        : `exited with code ${code}`;
      reject(new Error(`${command} ${args.join(' ')} ${outcome}`));
    });
  });
}
/**
 * Checks whether a command is usable by running `<command> --version`.
 *
 * @param {string} command Executable name or path to probe.
 * @returns {Promise<boolean>} true only when the probe exits with code 0;
 *   false when it cannot be spawned or exits with any other status.
 */
function commandExists(command) {
  const probeOptions = {
    stdio: 'ignore',
    shell: process.platform === 'win32',
  };
  return new Promise((resolve) => {
    const probe = spawn(command, ['--version'], probeOptions);
    probe.once('exit', (status) => {
      resolve(status === 0);
    });
    probe.once('error', () => {
      resolve(false);
    });
  });
}
/**
 * Reports whether a path can be accessed.
 *
 * @param {string} filePath Path to check.
 * @returns {Promise<boolean>} false when fs.access rejects for any reason.
 */
async function pathExists(filePath) {
  let reachable = true;
  try {
    await fs.access(filePath);
  } catch {
    reachable = false;
  }
  return reachable;
}
/**
 * Decides whether a build output is at least as new as all of its inputs,
 * comparing modification times.
 *
 * @param {string} output Path of the built artifact.
 * @param {string[]} inputs Paths of the files the artifact is built from.
 * @returns {Promise<boolean>} false when the output is missing or any input
 *   was modified after it; true otherwise.
 */
async function isUpToDate(output, inputs) {
  const outputPresent = await pathExists(output);
  if (!outputPresent) {
    return false;
  }

  const { mtimeMs: builtAt } = await fs.stat(output);
  for (const input of inputs) {
    const { mtimeMs: changedAt } = await fs.stat(input);
    if (changedAt > builtAt) {
      return false;
    }
  }
  return true;
}
/**
 * Verifies that a build tool is available.
 *
 * @param {string} command Tool to look for.
 * @param {string} helpText Hint appended to the "not found" message.
 * @returns {Promise<boolean>} true when the tool is available; false when it
 *   is missing and the build is optional (a skip notice is logged).
 * @throws {Error} When the tool is missing and the build is required.
 */
async function ensureCommand(command, helpText) {
  const available = await commandExists(command);
  if (!available) {
    const message = `${command} was not found. ${helpText}`;
    if (requireBuild) {
      throw new Error(message);
    }
    console.log(`Skipping semantic helper build: ${message}`);
  }
  return available;
}
/**
 * Confirms that every helper source file is present before compiling.
 *
 * Follows the same policy as ensureCommand: a missing file is a hard error
 * when the build is required, otherwise the build is skipped with a notice.
 * Without this guard a missing source surfaces as a raw ENOENT from fs.stat
 * or as a compiler failure, which would fail an optional build.
 *
 * @param {string[]} inputs Source files the helper is built from.
 * @returns {Promise<boolean>} true when all inputs exist.
 * @throws {Error} When an input is missing and the build is required.
 */
async function ensureSources(inputs) {
  for (const input of inputs) {
    if (await pathExists(input)) {
      continue;
    }
    const message = `Semantic helper source is missing: ${path.relative(rootDir, input)}`;
    if (requireBuild) {
      throw new Error(message);
    }
    console.log(`Skipping semantic helper build: ${message}`);
    return false;
  }
  return true;
}

// Build the native helper for the target platform, skipping the work when the
// toolchain or sources are unavailable (optional builds) or the existing
// binary is already newer than its sources.
if (platform === 'darwin') {
  const source = path.join(semanticsRoot, 'helpers', 'macos', 'CloudCLISemantics.swift');
  const output = path.join(outDir, 'CloudCLISemantics');
  if (!(await ensureCommand('swiftc', 'Install Xcode Command Line Tools to compile the macOS helper.'))) {
    process.exit(0);
  }
  if (!(await ensureSources([source]))) {
    process.exit(0);
  }
  if (await isUpToDate(output, [source])) {
    console.log(`Semantic helper is up to date: ${path.relative(rootDir, output)}`);
    process.exit(0);
  }
  await fs.mkdir(outDir, { recursive: true });
  await run('swiftc', [
    source,
    '-o',
    output,
    '-framework',
    'AppKit',
    '-framework',
    'ApplicationServices',
  ]);
  // Ensure the compiled helper is executable.
  await fs.chmod(output, 0o755);
  console.log(`Built ${path.relative(rootDir, output)}`);
} else if (platform === 'win32') {
  const project = path.join(semanticsRoot, 'helpers', 'windows', 'CloudCLISemantics.csproj');
  const source = path.join(semanticsRoot, 'helpers', 'windows', 'Program.cs');
  const output = path.join(outDir, 'CloudCLISemantics.exe');
  if (!(await ensureCommand('dotnet', '.NET SDK is required to compile the Windows helper.'))) {
    process.exit(0);
  }
  if (!(await ensureSources([project, source]))) {
    process.exit(0);
  }
  if (await isUpToDate(output, [project, source])) {
    console.log(`Semantic helper is up to date: ${path.relative(rootDir, output)}`);
    process.exit(0);
  }
  await fs.mkdir(outDir, { recursive: true });
  await run('dotnet', [
    'publish',
    project,
    '-c',
    'Release',
    '-r',
    // NOTE(review): every arch other than arm64 (including ia32) is published
    // as win-x64 while outDir still uses the raw arch name; confirm intended.
    arch === 'arm64' ? 'win-arm64' : 'win-x64',
    '--self-contained',
    'false',
    '-p:PublishSingleFile=true',
    '-o',
    outDir,
  ]);
  console.log(`Built ${path.relative(rootDir, output)}`);
} else {
  console.log(`Semantic helper build is not supported for ${platform}-${arch}.`);
}

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env node
import fs from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Repository root: one directory above the directory holding this script.
const rootDir = path.resolve(__dirname, '..');
// Helper binaries in the source tree...
const sourceDir = path.join(rootDir, 'server', 'modules', 'computer-use', 'semantics', 'bin');
// ...and the mirrored location under dist-server that they are copied to.
const targetDir = path.join(rootDir, 'dist-server', 'server', 'modules', 'computer-use', 'semantics', 'bin');
/**
 * Reports whether a path can be accessed.
 *
 * @param {string} filePath Path to check.
 * @returns {Promise<boolean>} false when fs.access rejects for any reason.
 */
async function pathExists(filePath) {
  let found = true;
  try {
    await fs.access(filePath);
  } catch {
    found = false;
  }
  return found;
}
/**
 * Copies the semantic helper binaries into the dist-server tree.
 * Does nothing when the source bin directory does not exist.
 */
async function copySemanticHelpers() {
  const hasHelpers = await pathExists(sourceDir);
  if (!hasHelpers) {
    return;
  }
  // Create the parent of the target so the recursive copy has a destination.
  await fs.mkdir(path.dirname(targetDir), { recursive: true });
  await fs.cp(sourceDir, targetDir, { recursive: true });
  console.log(`Copied Computer Use semantic helpers to ${path.relative(rootDir, targetDir)}`);
}

await copySemanticHelpers();

View File

@@ -108,8 +108,8 @@ function sha256(filePath) {
const platform = mapPlatform(process.env.CLOUDCLI_BUNDLE_PLATFORM || process.platform);
const arch = mapArch(process.env.CLOUDCLI_BUNDLE_ARCH || process.arch);
const version = packageJson.version;
const bundleName = `cloudcli-server-${version}-${platform}-${arch}.tar.gz`;
const bundleRoot = path.join(rootDir, 'release', 'server-bundles');
const bundleName = `cloudcli-local-server-${version}-${platform}-${arch}.tar.gz`;
const bundleRoot = path.join(rootDir, 'release', 'local-server');
const stageDir = path.join(bundleRoot, `.stage-${version}-${platform}-${arch}`);
const archivePath = path.join(bundleRoot, bundleName);

View File

@@ -85,7 +85,7 @@ function buildDesktopPackageJson(copiedOptionalDependencies) {
artifactName: packageJson.build.artifactName,
electronVersion: getElectronVersion(),
directories: {
output: '../../release',
output: '../../release/desktop',
},
extraMetadata: {
main: 'electron/main.js',
@@ -116,8 +116,7 @@ await copyRequired('public');
// not the full local server. Local CloudCLI is downloaded on demand.
await copyRequired('dist-server/server/computer-use-agent.js');
await copyIfExists('dist-server/server/computer-use-agent.js.map');
await copyRequired('dist-server/server/modules/computer-use/computer-executor.js');
await copyIfExists('dist-server/server/modules/computer-use/computer-executor.js.map');
await copyRequired('dist-server/server/modules/computer-use');
const copiedRuntimeDependencies = [];
if (await copyNodeModule('ws')) {

View File

@@ -18,14 +18,14 @@ import readline from 'node:readline';
import { WebSocket } from 'ws';
import {
executor,
captureScreenshot,
getRuntimeReadiness,
type ExecutorTarget,
type Point,
type ClickButton,
type ScrollDirection,
} from './modules/computer-use/computer-executor.js';
import { runRawComputerAction } from './modules/computer-use/actions/raw-action-dispatcher.js';
import type { RawActionTarget, RawComputerAction } from './modules/computer-use/actions/raw-action-types.js';
import { computerSemanticsService } from './modules/computer-use/computer-semantics.service.js';
type ConsentMode = 'ask' | 'auto';
@@ -117,69 +117,76 @@ function asPoint(value: unknown): Point | undefined {
return undefined;
}
async function snapshot(target: ExecutorTarget) {
const { dataUrl, size } = await captureScreenshot();
return { screenshotDataUrl: dataUrl, displaySize: size || target.displaySize };
}
async function runAction(type: string, params: Record<string, unknown>): Promise<Record<string, unknown>> {
const readiness = getRuntimeReadiness();
if (!readiness.nutInstalled || !readiness.screenshotInstalled) {
throw new Error('Computer Use runtime is not installed on the desktop agent.');
}
const target: ExecutorTarget = {
displaySize: (params.displaySize as ExecutorTarget['displaySize']) ?? null,
};
function rawActionFromRelay(type: string, params: Record<string, unknown>): RawComputerAction {
const point = asPoint(params.point);
switch (type) {
case 'screenshot':
return snapshot(target);
case 'cursor_position': {
const position = await executor.cursorPosition(target);
return { ...(await snapshot(target)), position, cursor: position };
}
return { type: 'screenshot' };
case 'cursor_position':
return { type: 'cursor_position' };
case 'mouse_move':
if (!point) {
throw new Error('mouse_move requires a valid point.');
}
await executor.moveTo(target, point);
return { ...(await snapshot(target)), cursor: point };
return { type: 'mouse_move', point };
case 'click':
await executor.click(target, (params.button as ClickButton) || 'left', point, params.double === true);
return { ...(await snapshot(target)), cursor: point ?? null };
return {
type: 'click',
button: (params.button as ClickButton) || 'left',
point,
double: params.double === true,
};
case 'drag': {
const from = asPoint(params.from);
const to = asPoint(params.to);
if (!from || !to) {
throw new Error('drag requires valid from and to points.');
}
await executor.drag(target, from, to, (params.button as ClickButton) || 'left');
return { ...(await snapshot(target)), cursor: to };
return { type: 'drag', from, to, button: (params.button as ClickButton) || 'left' };
}
case 'type':
await executor.type(String(params.text ?? ''));
return snapshot(target);
return { type: 'type', text: String(params.text ?? '') };
case 'key':
await executor.pressChord(String(params.key ?? ''));
return snapshot(target);
return { type: 'key', key: String(params.key ?? '') };
case 'scroll':
await executor.scroll(
target,
(params.direction as ScrollDirection) || 'down',
typeof params.amount === 'number' ? params.amount : 3,
return {
type: 'scroll',
direction: (params.direction as ScrollDirection) || 'down',
amount: typeof params.amount === 'number' ? params.amount : 3,
point,
);
return { ...(await snapshot(target)), cursor: point ?? null };
};
case 'wait':
await new Promise((resolve) => setTimeout(resolve, Math.max(0, Math.min(Number(params.ms) || 1000, 10_000))));
return snapshot(target);
return { type: 'wait', ms: typeof params.ms === 'number' ? params.ms : undefined };
default:
throw new Error(`Unsupported computer action: ${type}`);
}
}
/**
 * Executes one relayed Computer Use action on this desktop agent.
 *
 * `semantic_tool` requests are forwarded to the semantics service and are not gated by the
 * runtime-readiness check; every other action type is converted with `rawActionFromRelay`
 * and dispatched through `runRawComputerAction`.
 *
 * @param type - Action name received from the relay.
 * @param params - Untyped action parameters received from the relay.
 * @returns The result object produced by the semantics service or the raw dispatcher.
 * @throws If `semantic_tool` is missing `toolName`, or the raw runtime is not installed.
 */
async function runAction(type: string, params: Record<string, unknown>): Promise<Record<string, unknown>> {
  if (type !== 'semantic_tool') {
    // Raw actions depend on the native input and screenshot runtime being present.
    const { nutInstalled, screenshotInstalled } = getRuntimeReadiness();
    if (!(nutInstalled && screenshotInstalled)) {
      throw new Error('Computer Use runtime is not installed on the desktop agent.');
    }
    const target: RawActionTarget = {
      displaySize: (params.displaySize as RawActionTarget['displaySize']) ?? null,
    };
    return await runRawComputerAction(rawActionFromRelay(type, params), target) as Record<string, unknown>;
  }

  const toolName = typeof params.toolName === 'string' ? params.toolName : '';
  const rawArguments = params.arguments;
  const args = rawArguments && typeof rawArguments === 'object'
    ? rawArguments as Record<string, unknown>
    : {};
  // Sessions without an explicit id share the 'default' bucket.
  const sessionId = typeof params.sessionId === 'string' ? params.sessionId : 'default';
  if (!toolName) {
    throw new Error('semantic_tool requires toolName.');
  }
  // The relay-level sessionId always wins over any sessionId nested inside the tool arguments.
  return await computerSemanticsService.callTool(toolName, { ...args, sessionId }) as Record<string, unknown>;
}
// --- Relay connection ------------------------------------------------------
function connect(url: string): void {

View File

@@ -21,12 +21,39 @@ const readString = (value: unknown, name: string): string => {
return value.trim();
};
/**
 * Returns the trimmed string, or `undefined` when the value is not a string
 * or is empty/whitespace-only.
 */
const readOptionalString = (value: unknown): string | undefined => {
  if (typeof value !== 'string') {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed === '' ? undefined : trimmed;
};
/** Returns the value when it is a finite number; otherwise `undefined` (NaN and ±Infinity are rejected). */
const readNumber = (value: unknown): number | undefined => {
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    return undefined;
  }
  return value;
};
/** Normalizes a mouse-button argument; anything other than 'right' or 'middle' becomes 'left'. */
const readMouseButton = (value: unknown): 'left' | 'right' | 'middle' => {
  switch (value) {
    case 'right':
      return 'right';
    case 'middle':
      return 'middle';
    default:
      return 'left';
  }
};
// Base URL of the Computer Use API. Overridable through CLOUDCLI_COMPUTER_USE_API_URL;
// a single trailing slash is stripped.
const apiUrl = (process.env.CLOUDCLI_COMPUTER_USE_API_URL || 'http://127.0.0.1:3001/api/computer-use-mcp').replace(/\/$/, '');
// Token read from CLOUDCLI_COMPUTER_USE_MCP_TOKEN. Empty when unset, in which case
// callComputerUseApi refuses to run.
const apiToken = process.env.CLOUDCLI_COMPUTER_USE_MCP_TOKEN || '';
// Usage guidance for the agent, returned to the client as the `instructions` field next to
// `serverInfo`. This text is sent at runtime; edit it as product copy, not as a comment.
const computerUseInstructions = `
CloudCLI Computer Use lets you operate the user's real desktop through guarded sessions. Use it deliberately: observe first, act second, then verify.
Recommended app workflow:
1. If you do not know the target app name, call computer_list_apps.
2. Call computer_get_app_state for the target app before app-scoped actions. It returns a screenshot, accessibility elements, and a stateId.
3. Prefer semantic element actions using stateId + element_index from the latest computer_get_app_state result. Do not guess element indexes or reuse them after large UI changes without refreshing state.
4. Use x/y coordinates from the returned screenshot only when no suitable element_index is available.
5. After every action, inspect the returned screenshot/state before deciding the next action.
Use app-scoped tools when the target app is known: computer_list_apps, computer_get_app_state, computer_click_element, computer_perform_secondary_action, computer_set_value, computer_type_text, computer_press_key, computer_scroll_element, and computer_app_drag.
Use raw desktop tools only when you need full-screen coordinate control, cursor position, or current-focus input: computer_screenshot, computer_cursor_position, computer_mouse_move, computer_click, computer_drag, computer_type, computer_key, computer_scroll, computer_wait, and computer_close_session. Raw coordinates are screenshot pixels, so call computer_screenshot first when you need a coordinate frame.
Most tools can use or create the active agent session automatically when sessionId is omitted. In local mode, input actions require the user to grant control in the Computer tab before they work. In cloud mode, approval is handled by the linked CloudCLI desktop app.
If a tool reports missing permission, denied control, or no available desktop session, stop retrying and ask the user to fix access. For local mode, ask them to open CloudCLI Desktop, go to the Computer tab, enable Computer Use, grant the requested OS permissions, and allow the session. On macOS this usually means Accessibility and Screen Recording. For cloud mode, ask them to keep the linked CloudCLI Desktop app running and approve the cloud agent's Computer Use request there.
Ask before sending, deleting, purchasing, approving, uploading, publishing, changing account settings, or making other externally visible or destructive changes. Do not inspect unrelated private content unless the user explicitly asked for that task.
`.trim();
async function callComputerUseApi(toolName: string, input: Record<string, unknown>) {
if (!apiToken) {
throw new Error('CLOUDCLI_COMPUTER_USE_MCP_TOKEN is not configured.');
@@ -105,189 +132,309 @@ function toolResult(value: unknown) {
const sessionIdSchema = {
type: 'object',
properties: {
sessionId: { type: 'string', description: 'Computer Use session id.' },
sessionId: { type: 'string', description: 'Optional. Omit to use or create the active agent session automatically.' },
},
required: ['sessionId'],
};
const pointSchema = {
type: 'object',
properties: {
sessionId: { type: 'string' },
x: { type: 'number', description: 'X coordinate in screenshot pixel space.' },
y: { type: 'number', description: 'Y coordinate in screenshot pixel space.' },
},
required: ['sessionId'],
const optionalSessionProperty = sessionIdSchema.properties.sessionId;
const withOptionalSession = (properties: Record<string, unknown> = {}) => ({
sessionId: optionalSessionProperty,
...properties,
});
const optionalSessionInput = (args: Record<string, unknown>, extra: Record<string, unknown> = {}) => ({
sessionId: readOptionalString(args.sessionId),
...extra,
});
const stateIdProperty = {
type: 'string',
description: 'State id returned by the latest computer_get_app_state call for this app. Send it with element_index so the runtime can resolve the cached element.',
};
const elementIndexProperty = {
type: 'string',
description: 'Element index from the latest computer_get_app_state result for this app. Use with stateId when possible.',
};
const tools: ToolDefinition[] = [
{
name: 'computer_create_session',
description: 'Create a Computer Use session that controls the user desktop. The session starts WITHOUT control: the user must grant control in the Computer panel before any action will work. Returns a screenshot once available.',
inputSchema: { type: 'object', properties: {} },
name: 'computer_list_apps',
description: 'Discover app names, bundle identifiers, process names, and window titles that can be used as the app target for app-scoped Computer Use tools. Call this first when you are unsure which app string to pass to computer_get_app_state.',
inputSchema: { type: 'object', properties: withOptionalSession() },
},
{
name: 'computer_list_sessions',
description: 'List Computer Use sessions and whether the user has granted control.',
inputSchema: { type: 'object', properties: {} },
name: 'computer_get_app_state',
description: 'Inspect a target app and return its current screenshot, accessibility elements, and stateId. Call this before element-targeted actions, after navigation, and whenever the UI may have changed enough that old element indexes could be stale.',
inputSchema: {
type: 'object',
properties: withOptionalSession({
app: { type: 'string', description: 'App name, process name, bundle identifier, or window title from computer_list_apps or the user request.' },
}),
required: ['app'],
},
},
{
name: 'computer_click_element',
description: 'Click a target inside an app. Prefer stateId + element_index from computer_get_app_state; use x/y screenshot coordinates only when the target is not represented in the accessibility elements.',
inputSchema: {
type: 'object',
properties: withOptionalSession({
app: { type: 'string', description: 'Target app name, process name, bundle identifier, or window title.' },
stateId: stateIdProperty,
element_index: elementIndexProperty,
x: { type: 'number', description: 'X coordinate in screenshot pixel coordinates from computer_get_app_state.' },
y: { type: 'number', description: 'Y coordinate in screenshot pixel coordinates from computer_get_app_state.' },
click_count: { type: 'integer', description: 'Number of clicks, usually 1. Defaults to 1 and is capped by the runtime.' },
mouse_button: { type: 'string', enum: ['left', 'right', 'middle'], description: 'Button for the click; omitted means left.' },
}),
required: ['app'],
},
},
{
name: 'computer_perform_secondary_action',
description: 'Open the secondary action for a target inside an app, typically a context menu. Prefer stateId + element_index; if native secondary actions are unavailable, the runtime falls back to a right-click at the resolved point.',
inputSchema: {
type: 'object',
properties: withOptionalSession({
app: { type: 'string', description: 'Target app name, process name, bundle identifier, or window title.' },
stateId: stateIdProperty,
element_index: elementIndexProperty,
x: { type: 'number', description: 'X coordinate in screenshot pixel coordinates from computer_get_app_state.' },
y: { type: 'number', description: 'Y coordinate in screenshot pixel coordinates from computer_get_app_state.' },
}),
required: ['app'],
},
},
{
name: 'computer_set_value',
description: 'Set the value of a specific editable element in an app. Prefer stateId + element_index for a settable accessibility element; coordinate fallback focuses the resolved point and replaces the current value, so do not call this unless the target is resolved.',
inputSchema: {
type: 'object',
properties: withOptionalSession({
app: { type: 'string', description: 'Target app name, process name, bundle identifier, or window title.' },
stateId: stateIdProperty,
element_index: elementIndexProperty,
x: { type: 'number', description: 'X coordinate in screenshot pixel coordinates from computer_get_app_state.' },
y: { type: 'number', description: 'Y coordinate in screenshot pixel coordinates from computer_get_app_state.' },
value: { type: 'string', description: 'Exact value to put into the target element.' },
}),
required: ['app', 'value'],
},
},
{
name: 'computer_type_text',
description: 'Type literal text into the target app using keyboard input. Use after you have focused the intended field with computer_click_element or verified the correct focus in computer_get_app_state.',
inputSchema: {
type: 'object',
properties: withOptionalSession({
app: { type: 'string', description: 'Target app name, process name, bundle identifier, or window title.' },
text: { type: 'string', description: 'Text to enter exactly as provided.' },
}),
required: ['app', 'text'],
},
},
{
name: 'computer_press_key',
description: 'Press a key or key combination in the target app. Use for navigation, shortcuts, and confirmation keys after verifying the intended app/focus.',
inputSchema: {
type: 'object',
properties: withOptionalSession({
app: { type: 'string', description: 'Target app name, process name, bundle identifier, or window title.' },
key: { type: 'string', description: 'Key or chord, using names such as Return, Escape, Tab, ctrl+s, cmd+a, Up, or Page_Down.' },
}),
required: ['app', 'key'],
},
},
{
name: 'computer_scroll_element',
description: 'Scroll a target area inside an app. Prefer stateId + element_index for scrollable elements; use x/y screenshot coordinates only when the scroll target is visible but not represented as an element.',
inputSchema: {
type: 'object',
properties: withOptionalSession({
app: { type: 'string', description: 'Target app name, process name, bundle identifier, or window title.' },
stateId: stateIdProperty,
element_index: elementIndexProperty,
x: { type: 'number', description: 'X coordinate in screenshot pixel coordinates from computer_get_app_state.' },
y: { type: 'number', description: 'Y coordinate in screenshot pixel coordinates from computer_get_app_state.' },
direction: { type: 'string', enum: ['up', 'down', 'left', 'right'], description: 'Direction to scroll the target.' },
pages: { type: 'number', description: 'How far to scroll, measured in page units. Fractional values are allowed; default is 1.' },
}),
required: ['app', 'direction'],
},
},
{
name: 'computer_app_drag',
description: 'Drag inside a target app from one screenshot coordinate to another. Use for sliders, selections, map/canvas gestures, or drag-and-drop when no semantic element action is available.',
inputSchema: {
type: 'object',
properties: withOptionalSession({
app: { type: 'string', description: 'Target app name, process name, bundle identifier, or window title.' },
from_x: { type: 'number', description: 'Start X coordinate in screenshot pixels.' },
from_y: { type: 'number', description: 'Start Y coordinate in screenshot pixels.' },
to_x: { type: 'number', description: 'End X coordinate in screenshot pixels.' },
to_y: { type: 'number', description: 'End Y coordinate in screenshot pixels.' },
}),
required: ['app', 'from_x', 'from_y', 'to_x', 'to_y'],
},
},
{
name: 'computer_screenshot',
description: 'Capture the current desktop screenshot. Returns the image plus the display size to use for coordinates.',
description: 'Capture the full desktop screenshot and current display size. Use this before raw coordinate actions when an app-specific accessibility state is unavailable or the task spans multiple apps.',
inputSchema: sessionIdSchema,
},
{
name: 'computer_cursor_position',
description: 'Get the current mouse cursor position in screenshot pixel space.',
description: 'Get the current mouse cursor position in desktop screenshot pixel coordinates. Useful after a raw action misses or when coordinating pointer-relative steps.',
inputSchema: sessionIdSchema,
},
{
name: 'computer_mouse_move',
description: 'Move the mouse cursor to x/y (screenshot pixel space).',
inputSchema: {
type: 'object',
properties: { sessionId: { type: 'string' }, x: { type: 'number' }, y: { type: 'number' } },
required: ['sessionId', 'x', 'y'],
},
},
{
name: 'computer_left_click',
description: 'Left-click. Optionally provide x/y to move there first.',
inputSchema: pointSchema,
},
{
name: 'computer_right_click',
description: 'Right-click. Optionally provide x/y to move there first.',
inputSchema: pointSchema,
},
{
name: 'computer_middle_click',
description: 'Middle-click. Optionally provide x/y to move there first.',
inputSchema: pointSchema,
},
{
name: 'computer_double_click',
description: 'Double-click. Optionally provide x/y to move there first.',
inputSchema: pointSchema,
},
{
name: 'computer_left_click_drag',
description: 'Press the left button at start coordinates and release at end coordinates (drag).',
description: 'Move the mouse cursor to an exact full-desktop screenshot coordinate. Call computer_screenshot first if you do not already have a current coordinate frame.',
inputSchema: {
type: 'object',
properties: {
sessionId: { type: 'string' },
startX: { type: 'number' }, startY: { type: 'number' },
endX: { type: 'number' }, endY: { type: 'number' },
sessionId: optionalSessionProperty,
x: { type: 'number', description: 'X coordinate in full-desktop screenshot pixels.' },
y: { type: 'number', description: 'Y coordinate in full-desktop screenshot pixels.' },
},
required: ['sessionId', 'startX', 'startY', 'endX', 'endY'],
required: ['x', 'y'],
},
},
{
name: 'computer_click',
description: 'Raw desktop click at the current cursor or at optional full-desktop screenshot coordinates. Prefer computer_click_element when the target app and element are known.',
inputSchema: {
type: 'object',
properties: {
sessionId: optionalSessionProperty,
x: { type: 'number', description: 'Optional X coordinate in full-desktop screenshot pixels.' },
y: { type: 'number', description: 'Optional Y coordinate in full-desktop screenshot pixels.' },
mouseButton: { type: 'string', enum: ['left', 'right', 'middle'], description: 'Button for the click; omitted means left.' },
clickCount: { type: 'integer', description: 'How many times to click; omitted means 1.' },
},
},
},
{
name: 'computer_drag',
description: 'Raw desktop drag from start coordinates to end coordinates in full-desktop screenshot pixels. Prefer computer_app_drag for app-scoped drags when the target app is known.',
inputSchema: {
type: 'object',
properties: {
sessionId: optionalSessionProperty,
startX: { type: 'number', description: 'Start X coordinate in full-desktop screenshot pixels.' },
startY: { type: 'number', description: 'Start Y coordinate in full-desktop screenshot pixels.' },
endX: { type: 'number', description: 'End X coordinate in full-desktop screenshot pixels.' },
endY: { type: 'number', description: 'End Y coordinate in full-desktop screenshot pixels.' },
mouseButton: { type: 'string', enum: ['left', 'right', 'middle'], description: 'Button to hold during the drag; omitted means left.' },
},
required: ['startX', 'startY', 'endX', 'endY'],
},
},
{
name: 'computer_type',
description: 'Type a string of text at the current focus.',
description: 'Type literal text at the current desktop focus. This is not app-scoped; use only after verifying the intended field is focused.',
inputSchema: {
type: 'object',
properties: { sessionId: { type: 'string' }, text: { type: 'string' } },
required: ['sessionId', 'text'],
properties: { sessionId: optionalSessionProperty, text: { type: 'string', description: 'Text to enter exactly as provided at current focus.' } },
required: ['text'],
},
},
{
name: 'computer_key',
description: 'Press a key or key chord using xdotool-style names, e.g. "Return", "Escape", "ctrl+a", "Page_Down".',
description: 'Press a key or key chord at the current desktop focus. This is not app-scoped; use computer_press_key when the target app is known.',
inputSchema: {
type: 'object',
properties: { sessionId: { type: 'string' }, key: { type: 'string' } },
required: ['sessionId', 'key'],
properties: { sessionId: optionalSessionProperty, key: { type: 'string', description: 'Key or chord, using names such as Return, Escape, Tab, ctrl+s, cmd+a, Up, or Page_Down.' } },
required: ['key'],
},
},
{
name: 'computer_scroll',
description: 'Scroll the mouse wheel. direction is up/down/left/right; amount is the number of steps. Optionally provide x/y to move there first.',
description: 'Raw desktop scroll at the current cursor or optional full-desktop screenshot coordinates. Prefer computer_scroll_element when the target app/element is known.',
inputSchema: {
type: 'object',
properties: {
sessionId: { type: 'string' },
direction: { type: 'string', enum: ['up', 'down', 'left', 'right'] },
amount: { type: 'number' },
x: { type: 'number' },
y: { type: 'number' },
sessionId: optionalSessionProperty,
direction: { type: 'string', enum: ['up', 'down', 'left', 'right'], description: 'Direction to scroll the desktop target.' },
amount: { type: 'number', description: 'Scroll amount in wheel/page-like units. Defaults are runtime-defined.' },
x: { type: 'number', description: 'Optional X coordinate in full-desktop screenshot pixels.' },
y: { type: 'number', description: 'Optional Y coordinate in full-desktop screenshot pixels.' },
},
required: ['sessionId', 'direction'],
required: ['direction'],
},
},
{
name: 'computer_wait',
description: 'Wait for a short period (milliseconds, max 10000) then return a fresh screenshot.',
description: 'Wait briefly, up to 10000 ms, then return an updated desktop screenshot. Use after actions that trigger loading, animation, or delayed UI changes.',
inputSchema: {
type: 'object',
properties: { sessionId: { type: 'string' }, timeoutMs: { type: 'number' } },
required: ['sessionId'],
properties: { sessionId: optionalSessionProperty, timeoutMs: { type: 'number', description: 'Milliseconds to wait. The runtime caps long waits.' } },
},
},
{
name: 'computer_close_session',
description: 'Stop a Computer Use session and revoke control.',
description: 'Stop the active auto-created Computer Use session, or the specified session, and revoke agent input control for that session.',
inputSchema: sessionIdSchema,
},
];
async function callTool(name: string, args: Record<string, unknown>) {
switch (name) {
case 'computer_create_session':
return toolResult(await callComputerUseApi(name, {}));
case 'computer_list_sessions':
return toolResult(await callComputerUseApi(name, {}));
case 'computer_app_drag':
case 'computer_click_element':
case 'computer_get_app_state':
case 'computer_list_apps':
case 'computer_perform_secondary_action':
case 'computer_press_key':
case 'computer_scroll_element':
case 'computer_set_value':
case 'computer_type_text':
return toolResult(await callComputerUseApi(name, args));
case 'computer_screenshot':
case 'computer_cursor_position':
case 'computer_close_session':
return toolResult(await callComputerUseApi(name, { sessionId: readString(args.sessionId, 'sessionId') }));
return toolResult(await callComputerUseApi(name, optionalSessionInput(args)));
case 'computer_mouse_move':
return toolResult(await callComputerUseApi(name, {
sessionId: readString(args.sessionId, 'sessionId'),
return toolResult(await callComputerUseApi(name, optionalSessionInput(args, {
x: readNumber(args.x),
y: readNumber(args.y),
}));
case 'computer_left_click':
case 'computer_right_click':
case 'computer_middle_click':
case 'computer_double_click':
return toolResult(await callComputerUseApi(name, {
sessionId: readString(args.sessionId, 'sessionId'),
})));
case 'computer_click':
return toolResult(await callComputerUseApi(name, optionalSessionInput(args, {
x: readNumber(args.x),
y: readNumber(args.y),
}));
case 'computer_left_click_drag':
return toolResult(await callComputerUseApi(name, {
sessionId: readString(args.sessionId, 'sessionId'),
mouseButton: readMouseButton(args.mouseButton ?? args.mouse_button ?? args.button),
clickCount: readNumber(args.clickCount ?? args.click_count),
})));
case 'computer_drag':
return toolResult(await callComputerUseApi(name, optionalSessionInput(args, {
startX: readNumber(args.startX),
startY: readNumber(args.startY),
endX: readNumber(args.endX),
endY: readNumber(args.endY),
}));
mouseButton: readMouseButton(args.mouseButton ?? args.mouse_button ?? args.button),
})));
case 'computer_type':
return toolResult(await callComputerUseApi(name, {
sessionId: readString(args.sessionId, 'sessionId'),
return toolResult(await callComputerUseApi(name, optionalSessionInput(args, {
text: readString(args.text, 'text'),
}));
})));
case 'computer_key':
return toolResult(await callComputerUseApi(name, {
sessionId: readString(args.sessionId, 'sessionId'),
return toolResult(await callComputerUseApi(name, optionalSessionInput(args, {
key: readString(args.key, 'key'),
}));
})));
case 'computer_scroll':
return toolResult(await callComputerUseApi(name, {
sessionId: readString(args.sessionId, 'sessionId'),
return toolResult(await callComputerUseApi(name, optionalSessionInput(args, {
direction: typeof args.direction === 'string' ? args.direction : 'up',
amount: readNumber(args.amount),
x: readNumber(args.x),
y: readNumber(args.y),
}));
})));
case 'computer_wait':
return toolResult(await callComputerUseApi(name, {
sessionId: readString(args.sessionId, 'sessionId'),
return toolResult(await callComputerUseApi(name, optionalSessionInput(args, {
timeoutMs: readNumber(args.timeoutMs),
}));
})));
default:
throw new Error(`Unknown tool: ${name}`);
}
@@ -299,6 +446,7 @@ async function handleMessage(message: JsonRpcRequest) {
protocolVersion: '2024-11-05',
capabilities: { tools: {} },
serverInfo: { name: 'cloudcli-computer-use', version: '1.0.0' },
instructions: computerUseInstructions,
};
}

View File

@@ -0,0 +1,67 @@
import {
captureScreenshot,
executor,
type ExecutorTarget,
} from '@/modules/computer-use/computer-executor.js';
import type { RawActionResult, RawComputerAction, RawActionTarget } from '@/modules/computer-use/actions/raw-action-types.js';
/** Wait applied when the caller does not specify a duration, in milliseconds. */
const DEFAULT_WAIT_MS = 1000;
/** Upper bound applied to any requested wait, in milliseconds. */
const MAX_WAIT_MS = 10_000;

/**
 * Converts a requested wait into a whole number of milliseconds within [0, MAX_WAIT_MS].
 *
 * @param ms - Requested wait; `undefined` selects DEFAULT_WAIT_MS.
 * @returns The clamped, truncated wait in milliseconds.
 * @throws If `ms` is NaN or infinite.
 */
function normalizeWaitMs(ms: number | undefined): number {
  if (ms === undefined) {
    return DEFAULT_WAIT_MS;
  }
  if (!Number.isFinite(ms)) {
    throw new Error('Computer Use wait duration must be a finite number.');
  }
  const upperBounded = Math.min(ms, MAX_WAIT_MS);
  const nonNegative = Math.max(0, upperBounded);
  return Math.trunc(nonNegative);
}
/**
 * Captures a screenshot and packages it as a raw action result.
 * The captured size is preferred; the target's display size is used when the capture reports none.
 */
async function snapshot(target: RawActionTarget): Promise<RawActionResult> {
  const capture = await captureScreenshot();
  return {
    screenshotDataUrl: capture.dataUrl,
    displaySize: capture.size || target.displaySize,
  };
}
/**
 * Performs one raw desktop action through the executor and returns a fresh screenshot.
 *
 * Pointer actions also report the `cursor` point they targeted (`null` when no point was
 * supplied); `cursor_position` additionally reports `position`.
 *
 * @param action - The validated raw action to perform.
 * @param target - Display information forwarded to the executor and used as the size fallback.
 * @returns Screenshot data plus any cursor information for the action.
 * @throws If the action type is not one of the known raw actions, or the wait duration is not finite.
 */
export async function runRawComputerAction(
  action: RawComputerAction,
  target: RawActionTarget,
): Promise<RawActionResult> {
  const executorTarget: ExecutorTarget = { displaySize: target.displaySize };

  // Shared tail for pointer actions: screenshot first, then the cursor the action used.
  const snapshotWithCursor = async (cursor: RawActionResult['cursor']): Promise<RawActionResult> => ({
    ...(await snapshot(target)),
    cursor,
  });

  switch (action.type) {
    case 'screenshot':
      return snapshot(target);
    case 'cursor_position': {
      const position = await executor.cursorPosition(executorTarget);
      return { ...(await snapshot(target)), position, cursor: position };
    }
    case 'mouse_move':
      await executor.moveTo(executorTarget, action.point);
      return snapshotWithCursor(action.point);
    case 'click': {
      const { button, point, double } = action;
      await executor.click(executorTarget, button, point, double === true);
      return snapshotWithCursor(point ?? null);
    }
    case 'drag': {
      const { from, to, button } = action;
      await executor.drag(executorTarget, from, to, button ?? 'left');
      return snapshotWithCursor(to);
    }
    case 'type':
      await executor.type(action.text);
      return snapshot(target);
    case 'key':
      await executor.pressChord(action.key);
      return snapshot(target);
    case 'scroll': {
      const { direction, amount, point } = action;
      // Three steps is the default scroll amount when none is given.
      await executor.scroll(executorTarget, direction, amount ?? 3, point);
      return snapshotWithCursor(point ?? null);
    }
    case 'wait': {
      const delayMs = normalizeWaitMs(action.ms);
      await new Promise((resolve) => setTimeout(resolve, delayMs));
      return snapshot(target);
    }
    default: {
      // Compile-time exhaustiveness check; at runtime this reports an unexpected action type.
      const exhaustive: never = action;
      throw new Error(`Unsupported computer action: ${(exhaustive as { type?: string }).type || 'unknown'}`);
    }
  }
}

View File

@@ -0,0 +1,28 @@
import type {
ClickButton,
DisplaySize,
Point,
ScrollDirection,
} from '@/modules/computer-use/computer-executor.js';
/**
 * Discriminated union (on `type`) of the raw desktop actions the dispatcher can execute.
 * Optional fields fall back to dispatcher defaults: drag button 'left', scroll amount 3,
 * and a default wait when `ms` is omitted.
 */
export type RawComputerAction =
// Capture only; no input is sent.
| { type: 'screenshot' }
// Report the current pointer location.
| { type: 'cursor_position' }
| { type: 'mouse_move'; point: Point }
// `point` omitted: no point is passed to the executor.
| { type: 'click'; button: ClickButton; point?: Point; double?: boolean }
| { type: 'drag'; from: Point; to: Point; button?: ClickButton }
| { type: 'type'; text: string }
| { type: 'key'; key: string }
| { type: 'scroll'; direction: ScrollDirection; amount?: number; point?: Point }
// `ms` is clamped by the dispatcher before waiting.
| { type: 'wait'; ms?: number };
/**
 * Display context for a raw action. `displaySize` is forwarded to the executor and used as the
 * reported size when a screenshot capture does not provide one; `null` means unknown.
 */
export type RawActionTarget = {
displaySize: DisplaySize | null;
};
/** Result returned by the raw action dispatcher after an action completes. */
export type RawActionResult = {
// Screenshot taken after the action, as returned by the capture helper.
screenshotDataUrl?: string | null;
// Size reported by the capture, or the target's display size as a fallback.
displaySize?: DisplaySize | null;
// Point the action targeted; null when a click/scroll was issued without a point.
cursor?: Point | null;
// Pointer location; set by the cursor_position action.
position?: Point | null;
};

View File

@@ -0,0 +1,450 @@
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import {
captureScreenshot,
executor,
type ClickButton,
type ExecutorTarget,
type Point,
type ScrollDirection,
} from '@/modules/computer-use/computer-executor.js';
import type { SemanticAdapter } from '@/modules/computer-use/semantics/adapters/semantic-adapter.js';
import { createMacOsSemanticAdapter } from '@/modules/computer-use/semantics/adapters/macos/macos-semantic-adapter.js';
import { createWindowsSemanticAdapter } from '@/modules/computer-use/semantics/adapters/windows/windows-semantic-adapter.js';
import { resolveSemanticHelper } from '@/modules/computer-use/semantics/helpers/semantic-helper-resolver.js';
import { semanticSessionStore } from '@/modules/computer-use/semantics/semantic-session-store.js';
import type { SemanticAppState, SemanticElement } from '@/modules/computer-use/semantics/semantic-types.js';
// Promise-returning wrapper around child_process.execFile.
const execFileAsync = promisify(execFile);
// NOTE(review): presumably caps the number of elements returned per app state — usage is not visible here; confirm.
const MAX_APP_STATE_ELEMENTS = 250;
// Lazily resolved helper adapter: undefined = not resolved yet, null = no helper on this platform/install.
let helperAdapter: SemanticAdapter | null | undefined;
/** Returns the trimmed string, or an empty string when the value is not a string. */
function readString(value: unknown): string {
  if (typeof value !== 'string') {
    return '';
  }
  return value.trim();
}
/** Returns the value when it is a finite number; otherwise `undefined` (NaN and ±Infinity are rejected). */
function readNumber(value: unknown): number | undefined {
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    return undefined;
  }
  return value;
}
/** Normalizes a mouse-button argument; anything other than 'right' or 'middle' becomes 'left'. */
function readButton(value: unknown): ClickButton {
  if (value === 'right') {
    return 'right';
  }
  if (value === 'middle') {
    return 'middle';
  }
  return 'left';
}
/**
 * Reads a click count, defaulting to 1 when the value is not a finite number.
 * Fractions are truncated and the result is limited to the range 1..5.
 */
function readClickCount(value: unknown): number {
  const requested = readNumber(value);
  if (requested === undefined) {
    return 1;
  }
  const whole = Math.trunc(requested);
  const cappedAtFive = Math.min(5, whole);
  return Math.max(1, cappedAtFive);
}
/** Normalizes a scroll direction; anything other than 'up', 'left' or 'right' becomes 'down'. */
function readDirection(value: unknown): ScrollDirection {
  switch (value) {
    case 'up':
      return 'up';
    case 'left':
      return 'left';
    case 'right':
      return 'right';
    default:
      return 'down';
  }
}
/** Returns the trimmed `sessionId` from the input, or 'default' when it is missing or blank. */
function readSessionId(input: Record<string, unknown>): string {
  const explicitId = readString(input.sessionId);
  return explicitId ? explicitId : 'default';
}
/**
 * Computes the rounded center point of an element's bounds.
 *
 * @returns The center point, or `null` when the element has no bounds.
 */
function centerOf(element: SemanticElement): Point | null {
  const { bounds } = element;
  if (!bounds) {
    return null;
  }
  const centerX = Math.round(bounds.x + bounds.width / 2);
  const centerY = Math.round(bounds.y + bounds.height / 2);
  return { x: centerX, y: centerY };
}
/**
 * Looks up a previously stored element in the semantic session store.
 * Thin delegation to `semanticSessionStore.getElement`; `stateId` is passed through as given.
 */
function getCachedElement(sessionId: string, app: string, index: string, stateId?: string): SemanticElement | null {
return semanticSessionStore.getElement(sessionId, app, index, stateId);
}
/**
 * Resolves the point an action should target.
 *
 * Explicit finite `x`/`y` coordinates take precedence. Otherwise `element_index` (optionally
 * with `stateId`) is looked up in the session cache and the element's center is used.
 *
 * @returns The resolved point, or `undefined` when neither coordinates nor a cached element
 *   with bounds are available.
 */
function getPoint(input: Record<string, unknown>, sessionId: string, app: string): Point | undefined {
  const explicitX = readNumber(input.x);
  const explicitY = readNumber(input.y);
  if (explicitX !== undefined && explicitY !== undefined) {
    return { x: explicitX, y: explicitY };
  }

  const elementIndex = readString(input.element_index);
  if (!elementIndex) {
    return undefined;
  }

  const stateId = readString(input.stateId) || undefined;
  const cached = getCachedElement(sessionId, app, elementIndex, stateId);
  if (!cached) {
    return undefined;
  }
  return centerOf(cached) || undefined;
}
/**
 * Returns the platform semantic helper adapter, resolving it on first use.
 *
 * The outcome is cached in `helperAdapter`, including the negative result (`null`) for
 * unsupported platforms or when the helper cannot be resolved.
 */
function getHelperAdapter(): SemanticAdapter | null {
  if (helperAdapter !== undefined) {
    return helperAdapter;
  }

  const platform = process.platform;
  const platformSupported = platform === 'darwin' || platform === 'win32';

  let resolved: SemanticAdapter | null;
  // The helper is only probed on platforms that have an adapter.
  if (!platformSupported || !resolveSemanticHelper().available) {
    resolved = null;
  } else if (platform === 'darwin') {
    resolved = createMacOsSemanticAdapter();
  } else {
    resolved = createWindowsSemanticAdapter();
  }

  helperAdapter = resolved;
  return resolved;
}
/**
 * Decides whether a helper failure should fall back to the non-helper path.
 * True when the error text (case-insensitive) mentions that something is not implemented,
 * unavailable, not found, or does not exist.
 */
function shouldFallbackFromHelper(error: unknown): boolean {
  const fallbackPattern = /not implemented|unavailable|not found|does not exist/i;
  let text: string;
  if (error instanceof Error) {
    text = error.message;
  } else {
    text = String(error);
  }
  return fallbackPattern.test(text);
}
/**
 * Runs a helper-adapter operation and stores the resulting app state for the session.
 *
 * @param sessionId - Session under which the state is saved.
 * @param operation - Produces an app state using the helper adapter.
 * @returns The saved state, or `null` when no helper exists or the helper failed in a way
 *   that should fall back (see `shouldFallbackFromHelper`).
 * @throws Any other helper error is rethrown unchanged.
 */
async function withHelperState(
  sessionId: string,
  operation: (adapter: SemanticAdapter) => Promise<SemanticAppState>,
): Promise<SemanticAppState | null> {
  const adapter = getHelperAdapter();
  if (!adapter) {
    return null;
  }

  try {
    const state = await operation(adapter);
    return semanticSessionStore.save(sessionId, state);
  } catch (error) {
    if (!shouldFallbackFromHelper(error)) {
      throw error;
    }
    const reason = error instanceof Error ? error.message : String(error);
    console.warn('[ComputerSemantics] Falling back from helper:', reason);
    return null;
  }
}
/**
 * Runs a command without a shell and returns its stdout.
 *
 * @param command - Executable to run.
 * @param args - Arguments passed as-is.
 * @param timeout - Timeout in milliseconds (default 5000).
 * @returns Captured stdout; output is limited to a 4 MiB buffer.
 */
async function run(command: string, args: string[], timeout = 5000): Promise<string> {
  const result = await execFileAsync(command, args, {
    maxBuffer: 4 * 1024 * 1024,
    timeout,
    windowsHide: true,
  });
  return result.stdout;
}
/**
 * Lists foreground (non background-only) application processes on macOS via System Events.
 *
 * The AppleScript joins the names with linefeeds before returning them. osascript's default
 * list rendering separates items with ", ", which cannot be told apart from an app name
 * that itself contains a comma followed by a space.
 *
 * @returns One `{ name, running: true }` row per process name.
 */
async function listMacApps(): Promise<Array<Record<string, unknown>>> {
  const script = [
    'tell application "System Events"',
    'set appRows to {}',
    'repeat with p in (application processes whose background only is false)',
    'set end of appRows to (name of p as text)',
    'end repeat',
    'end tell',
    // One name per line keeps names containing ", " intact.
    "set AppleScript's text item delimiters to linefeed",
    'return appRows as text',
  ].join('\n');
  const output = await run('osascript', ['-e', script]);
  return output.split(/\r?\n/)
    .map((name) => name.trim())
    .filter(Boolean)
    .map((name) => ({ name, running: true }));
}
/**
 * Lists Windows processes that have a main window title, using PowerShell.
 *
 * `ConvertTo-Json` emits a bare object for a single match and an array for
 * several, so the parsed payload is normalised to an array before mapping.
 *
 * @returns One `{ name, pid, windowTitle, running: true }` record per process.
 */
async function listWindowsApps(): Promise<Array<Record<string, unknown>>> {
  const script = 'Get-Process | Where-Object { $_.MainWindowTitle } |'
    + ' '
    + 'Select-Object ProcessName, Id, MainWindowTitle | ConvertTo-Json -Depth 3';
  const output = await run('powershell.exe', ['-NoProfile', '-Command', script]);
  const payload = JSON.parse(output || '[]');
  const processes = Array.isArray(payload) ? payload : [payload];

  const apps: Array<Record<string, unknown>> = [];
  for (const proc of processes) {
    apps.push({
      name: proc.ProcessName,
      pid: proc.Id,
      windowTitle: proc.MainWindowTitle,
      running: true,
    });
  }
  return apps;
}
/**
 * Lists open windows on Linux via `wmctrl -lx`, falling back to a
 * de-duplicated process-name list from `ps` when wmctrl fails.
 *
 * `wmctrl -lx` columns are: window id, desktop number, WM_CLASS, client
 * machine, then the window title (which may contain spaces). The WM_CLASS
 * column comes before the client machine, so it maps to `className` and the
 * following column maps to `host`.
 *
 * @returns Window records from wmctrl, or at most 200 `{ name, running }`
 *   records from the `ps` fallback.
 */
async function listLinuxApps(): Promise<Array<Record<string, unknown>>> {
  try {
    const output = await run('wmctrl', ['-lx']);
    return output.split(/\r?\n/)
      .map((line) => line.trim())
      .filter(Boolean)
      .map((line) => {
        const [windowId, desktop, className, host, ...titleWords] = line.split(/\s+/);
        return {
          windowId,
          desktop,
          host,
          className,
          windowTitle: titleWords.join(' '),
          running: true,
        };
      });
  } catch {
    // Deliberate best-effort: any wmctrl failure falls back to `ps`.
    const output = await run('ps', ['-eo', 'comm=']);
    return [...new Set(output.split(/\r?\n/).map((name) => name.trim()).filter(Boolean))]
      .slice(0, 200)
      .map((name) => ({ name, running: true }));
  }
}
/**
 * Lists running apps using the implementation for the current platform.
 * darwin and win32 have dedicated listers; every other platform uses the
 * Linux lister.
 */
async function listApps(): Promise<Array<Record<string, unknown>>> {
  switch (process.platform) {
    case 'darwin':
      return listMacApps();
    case 'win32':
      return listWindowsApps();
    default:
      return listLinuxApps();
  }
}
/**
 * Captures a shallow accessibility tree for a running macOS app by driving
 * System Events through `osascript`.
 *
 * The AppleScript walks each window down to depth 4 and emits one
 * tab-separated row per element: index, role, title, value, "x,y,w,h" bounds.
 * Collection stops once more than MAX_APP_STATE_ELEMENTS rows exist.
 *
 * Script details that matter:
 * - The element-reading code in `emitElement` sits inside a
 *   `tell application "System Events"` block. Terms such as `UI elements`,
 *   `role` and `position` come from the System Events dictionary and are only
 *   available inside such a block.
 * - Accumulator variables avoid the name `rows`, which is also the plural of
 *   the System Events `row` class and would be read as that class inside a
 *   System Events `tell` block.
 * - Rows are joined with linefeeds before being returned. osascript's default
 *   list rendering separates items with ", ", which would split any row whose
 *   title or value contains a comma followed by a space.
 *
 * NOTE(review): a value that itself contains a newline still spans several
 * output lines and will be parsed as separate rows — not addressed here.
 *
 * @param app Process name as known to System Events.
 * @returns Parsed elements; `bounds` is set only when four finite numbers were emitted.
 * @throws When osascript fails, e.g. the app is not running.
 */
async function macAccessibilityTree(app: string): Promise<SemanticElement[]> {
  // Escape backslashes first, then quotes, so the name is a valid AppleScript string literal.
  const escapedApp = app.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
  const script = `
on safeText(v)
  try
    return v as text
  on error
    return ""
  end try
end safeText
on emitElement(e, depth, maxDepth, counter)
  if depth > maxDepth then return {}
  set emittedRows to {}
  tell application "System Events"
    try
      set roleText to my safeText(role of e)
    on error
      set roleText to "element"
    end try
    try
      set titleText to my safeText(title of e)
    on error
      set titleText to ""
    end try
    try
      set valueText to my safeText(value of e)
    on error
      set valueText to ""
    end try
    try
      set posValue to position of e
      set sizeValue to size of e
      set boundsText to ((item 1 of posValue) as text) & "," & ((item 2 of posValue) as text) & "," & ((item 1 of sizeValue) as text) & "," & ((item 2 of sizeValue) as text)
    on error
      set boundsText to ""
    end try
    set end of emittedRows to ((counter as text) & tab & roleText & tab & titleText & tab & valueText & tab & boundsText)
    if counter > ${MAX_APP_STATE_ELEMENTS} then return emittedRows
    try
      repeat with childElement in UI elements of e
        set childRows to my emitElement(childElement, depth + 1, maxDepth, counter + (count of emittedRows))
        set emittedRows to emittedRows & childRows
        if (count of emittedRows) > ${MAX_APP_STATE_ELEMENTS} then return emittedRows
      end repeat
    end try
  end tell
  return emittedRows
end emitElement
set collectedRows to {}
tell application "System Events"
  if not (exists process "${escapedApp}") then error "App is not running: ${escapedApp}"
  tell process "${escapedApp}"
    repeat with w in windows
      set collectedRows to collectedRows & my emitElement(w, 0, 4, (count of collectedRows) + 1)
      if (count of collectedRows) > ${MAX_APP_STATE_ELEMENTS} then exit repeat
    end repeat
  end tell
end tell
set previousDelimiters to AppleScript's text item delimiters
set AppleScript's text item delimiters to linefeed
set joinedRows to collectedRows as text
set AppleScript's text item delimiters to previousDelimiters
return joinedRows
`;
  const output = await run('osascript', ['-e', script], 10000);
  return output.split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean)
    .map((line, index) => {
      const [rawIndex, role, title, value, boundsText] = line.split('\t');
      const boundsParts = (boundsText || '').split(',').map((part) => Number.parseFloat(part));
      const hasBounds = boundsParts.length === 4 && boundsParts.every(Number.isFinite);
      return {
        // Fall back to the positional index when the row carries none.
        index: rawIndex || String(index + 1),
        role: role || 'element',
        title: title || undefined,
        value: value || undefined,
        bounds: hasBounds
          ? { x: boundsParts[0], y: boundsParts[1], width: boundsParts[2], height: boundsParts[3] }
          : undefined,
      };
    });
}
/**
 * Captures the accessibility tree for `app` without the native helper.
 *
 * Only darwin has an implementation. Failures there are reported through
 * `message` with an empty element list instead of being thrown. Every other
 * platform returns an empty list with a "not implemented" message.
 */
async function getAccessibilityTree(app: string): Promise<{ elements: SemanticElement[]; message?: string }> {
  if (process.platform !== 'darwin') {
    return {
      elements: [],
      message: 'Native accessibility tree capture is not implemented for this platform yet.',
    };
  }

  try {
    const elements = await macAccessibilityTree(app);
    return { elements };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { elements: [], message };
  }
}
/**
 * Produces and stores the current semantic state of `app` for a session.
 *
 * The native helper is tried first. When it yields nothing, the state is
 * assembled from a screenshot plus the fallback accessibility tree.
 *
 * @param sessionId Session the state is saved under.
 * @param app App name; must be non-empty.
 * @returns The state as returned by `semanticSessionStore.save`.
 * @throws Error when `app` is empty.
 */
async function getAppState(sessionId: string, app: string): Promise<SemanticAppState> {
  if (!app) {
    throw new Error('app is required.');
  }

  const fromHelper = await withHelperState(sessionId, (adapter) => adapter.getAppState({ sessionId, app }));
  if (fromHelper) {
    return fromHelper;
  }

  // Fallback path: the screenshot is taken before the tree, sequentially.
  const capture = await captureScreenshot();
  const { elements, message } = await getAccessibilityTree(app);
  const fallbackState: SemanticAppState = {
    stateId: semanticSessionStore.createStateId(),
    app,
    platform: process.platform,
    screenshotDataUrl: capture.dataUrl,
    displaySize: capture.size,
    elements,
    accessibilityTree: elements,
    message,
  };
  return semanticSessionStore.save(sessionId, fallbackState);
}
/**
 * Builds the executor target for an action.
 * Uses the display size of the cached state when there is one; otherwise
 * takes a fresh screenshot and uses its size.
 */
async function targetFor(sessionId: string, app: string, stateId?: string): Promise<ExecutorTarget> {
  const cachedSize = semanticSessionStore.getState(sessionId, app, stateId)?.displaySize;
  if (cachedSize) {
    return { displaySize: cachedSize };
  }
  const capture = await captureScreenshot();
  return { displaySize: capture.size };
}
/**
 * Entry point for semantic Computer Use tools.
 *
 * `callTool` dispatches on the tool name. Every action except `list_apps` and
 * `get_app_state` follows the same shape:
 *   1. Try the native helper through `withHelperState`; a non-null result is
 *      returned as-is.
 *   2. Otherwise validate the inputs the fallback needs, drive the generic
 *      `executor`, and return a freshly captured state from `getAppState`.
 *
 * Errors: unknown tool names and missing fallback inputs throw `Error`;
 * helper errors that do not qualify for fallback are rethrown.
 */
export const computerSemanticsService = {
async callTool(name: string, input: Record<string, unknown>): Promise<unknown> {
const sessionId = readSessionId(input);
switch (name) {
case 'list_apps': {
// Prefer the helper's app list; fall through to the platform lister only
// for errors that shouldFallbackFromHelper accepts.
const adapter = getHelperAdapter();
if (adapter) {
try {
return { apps: await adapter.listApps(), platform: process.platform };
} catch (error) {
if (!shouldFallbackFromHelper(error)) {
throw error;
}
console.warn('[ComputerSemantics] Falling back from helper:', error instanceof Error ? error.message : String(error));
}
}
return { apps: await listApps(), platform: process.platform };
}
case 'get_app_state':
return getAppState(sessionId, readString(input.app));
case 'click': {
const app = readString(input.app);
const helperState = await withHelperState(sessionId, (adapter) => adapter.clickElement({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
// Fallback: resolve a point from x/y or a cached element, then click it.
const stateId = readString(input.stateId) || undefined;
const point = getPoint(input, sessionId, app);
if (!point) {
throw new Error('click requires x/y or an element_index from computer_get_app_state.');
}
const target = await targetFor(sessionId, app, stateId);
// Both snake_case and camelCase argument spellings are accepted.
const button = readButton(input.mouse_button ?? input.mouseButton);
const clickCount = readClickCount(input.click_count ?? input.clickCount);
// Multi-clicks are issued as repeated individual clicks.
for (let index = 0; index < clickCount; index += 1) {
await executor.click(target, button, point, false);
}
return getAppState(sessionId, app);
}
case 'drag': {
const app = readString(input.app);
const helperState = await withHelperState(sessionId, (adapter) => adapter.drag({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
// Fallback drag needs all four explicit coordinates; element indexes are not consulted here.
const stateId = readString(input.stateId) || undefined;
const fromX = readNumber(input.from_x);
const fromY = readNumber(input.from_y);
const toX = readNumber(input.to_x);
const toY = readNumber(input.to_y);
if (fromX === undefined || fromY === undefined || toX === undefined || toY === undefined) {
throw new Error('drag requires from_x/from_y/to_x/to_y.');
}
await executor.drag(await targetFor(sessionId, app, stateId), { x: fromX, y: fromY }, { x: toX, y: toY }, readButton(input.mouse_button ?? input.mouseButton));
return getAppState(sessionId, app);
}
case 'scroll': {
const app = readString(input.app);
const helperState = await withHelperState(sessionId, (adapter) => adapter.scrollElement({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
const stateId = readString(input.stateId) || undefined;
const point = getPoint(input, sessionId, app);
if (!point) {
throw new Error('scroll requires x/y or an element_index from computer_get_app_state.');
}
// `pages` defaults to 1 when it is not a readable number.
await executor.scroll(await targetFor(sessionId, app, stateId), readDirection(input.direction), readNumber(input.pages) ?? 1, point);
return getAppState(sessionId, app);
}
case 'type_text': {
const app = readString(input.app);
const helperState = await withHelperState(sessionId, (adapter) => adapter.typeText({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
// Fallback types the text through the executor without targeting a point first.
await executor.type(readString(input.text));
return getAppState(sessionId, app);
}
case 'press_key': {
const app = readString(input.app);
const helperState = await withHelperState(sessionId, (adapter) => adapter.pressKey({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
await executor.pressChord(readString(input.key));
return getAppState(sessionId, app);
}
case 'set_value': {
const app = readString(input.app);
const helperState = await withHelperState(sessionId, (adapter) => adapter.setValue({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
const stateId = readString(input.stateId) || undefined;
const point = getPoint(input, sessionId, app);
if (!point) {
throw new Error('set_value requires x/y or an element_index from computer_get_app_state.');
}
// Fallback emulation: left-click the target, press the platform's
// cmd+a / ctrl+a chord, then type the new value.
await executor.click(await targetFor(sessionId, app, stateId), 'left', point, false);
await executor.pressChord(process.platform === 'darwin' ? 'cmd+a' : 'ctrl+a');
await executor.type(readString(input.value));
return getAppState(sessionId, app);
}
case 'perform_secondary_action': {
const app = readString(input.app);
const helperState = await withHelperState(sessionId, (adapter) => adapter.performSecondaryAction({ ...input, sessionId, app }));
if (helperState) {
return helperState;
}
const stateId = readString(input.stateId) || undefined;
const point = getPoint(input, sessionId, app);
if (!point) {
throw new Error('perform_secondary_action requires x/y or an element_index from computer_get_app_state.');
}
// Fallback for a secondary action is a single right-click at the point.
await executor.click(await targetFor(sessionId, app, stateId), 'right', point, false);
return getAppState(sessionId, app);
}
default:
throw new Error(`Unknown semantic Computer Use tool: ${name}`);
}
},
};

View File

@@ -1,6 +1,7 @@
import express from 'express';
import { computerUseService } from '@/modules/computer-use/computer-use.service.js';
import { semanticOperationForMcpTool } from '@/modules/computer-use/semantics/semantic-tool-dispatcher.js';
const router = express.Router();
@@ -36,6 +37,22 @@ function point(input: Record<string, unknown>): { x: number; y: number } | undef
: undefined;
}
/**
 * Reads a mandatory finite number from the tool input.
 *
 * @param input Raw tool arguments.
 * @param name Key to read.
 * @returns The numeric value stored under `name`.
 * @throws Error when the value is missing, not a number, NaN, or infinite.
 */
function requireNumber(input: Record<string, unknown>, name: string): number {
  const candidate = input[name];
  if (typeof candidate === 'number' && Number.isFinite(candidate)) {
    return candidate;
  }
  throw new Error(`${name} is required and must be a finite number.`);
}
/**
 * Reads a mandatory point from the `x` and `y` keys of the tool input.
 * `x` is validated before `y`, so a request missing both reports `x`.
 */
function requirePoint(input: Record<string, unknown>): { x: number; y: number } {
  const x = requireNumber(input, 'x');
  const y = requireNumber(input, 'y');
  return { x, y };
}
/**
 * Reads a mandatory point whose coordinates live under caller-chosen keys.
 *
 * @param input Raw tool arguments.
 * @param xName Key holding the horizontal coordinate (validated first).
 * @param yName Key holding the vertical coordinate.
 */
function requireNamedPoint(input: Record<string, unknown>, xName: string, yName: string): { x: number; y: number } {
  const x = requireNumber(input, xName);
  const y = requireNumber(input, yName);
  return { x, y };
}
router.use((req, res, next) => {
const expected = computerUseService.getMcpToken();
const token = readBearerToken(req.headers.authorization) || String(req.headers['x-computer-use-mcp-token'] || '');
@@ -49,17 +66,18 @@ router.use((req, res, next) => {
router.post('/tools/:toolName', async (req, res) => {
try {
const input = (req.body && typeof req.body === 'object' ? req.body : {}) as Record<string, unknown>;
const sessionId = typeof input.sessionId === 'string' ? input.sessionId : '';
const sessionId = typeof input.sessionId === 'string' ? input.sessionId : undefined;
const toolName = req.params.toolName;
const semanticOperation = semanticOperationForMcpTool(toolName);
let result: unknown;
if (semanticOperation) {
result = await computerUseService.callSemanticTool(semanticOperation, input);
res.json({ success: true, data: result });
return;
}
switch (toolName) {
case 'computer_create_session':
result = await computerUseService.createAgentSession();
break;
case 'computer_list_sessions':
result = await computerUseService.listAgentSessions();
break;
case 'computer_screenshot':
result = await computerUseService.agentScreenshot(sessionId);
break;
@@ -67,28 +85,23 @@ router.post('/tools/:toolName', async (req, res) => {
result = await computerUseService.agentCursorPosition(sessionId);
break;
case 'computer_mouse_move':
result = await computerUseService.agentMouseMove(sessionId, point(input) || { x: 0, y: 0 });
result = await computerUseService.agentMouseMove(sessionId, requirePoint(input));
break;
case 'computer_left_click':
result = await computerUseService.agentClick(sessionId, 'left', point(input));
case 'computer_click':
result = await computerUseService.agentUnifiedClick(sessionId, {
button: toButton(input.mouseButton ?? input.mouse_button ?? input.button),
point: point(input),
clickCount: typeof input.clickCount === 'number'
? input.clickCount
: typeof input.click_count === 'number'
? input.click_count
: 1,
});
break;
case 'computer_right_click':
result = await computerUseService.agentClick(sessionId, 'right', point(input));
break;
case 'computer_middle_click':
result = await computerUseService.agentClick(sessionId, 'middle', point(input));
break;
case 'computer_double_click':
result = await computerUseService.agentClick(sessionId, toButton(input.button), point(input), true);
break;
case 'computer_left_click_drag': {
const from = typeof input.startX === 'number' && typeof input.startY === 'number'
? { x: input.startX, y: input.startY }
: { x: 0, y: 0 };
const to = typeof input.endX === 'number' && typeof input.endY === 'number'
? { x: input.endX, y: input.endY }
: { x: 0, y: 0 };
result = await computerUseService.agentDrag(sessionId, from, to, 'left');
case 'computer_drag': {
const from = requireNamedPoint(input, 'startX', 'startY');
const to = requireNamedPoint(input, 'endX', 'endY');
result = await computerUseService.agentDrag(sessionId, from, to, toButton(input.mouseButton ?? input.mouse_button ?? input.button));
break;
}
case 'computer_type':

View File

@@ -56,43 +56,34 @@ router.get('/status', async (_req, res) => {
}
});
router.get('/settings', async (_req, res) => {
router.get('/settings', async (req: AuthenticatedRequest, res) => {
try {
requireUser(req);
res.json({ success: true, data: { settings: await computerUseService.getSettings() } });
} catch (error) {
res.status(500).json({
res.status(getErrorStatusCode(error, 500)).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to load Computer Use settings.',
});
}
});
router.put('/settings', async (req, res) => {
router.put('/settings', async (req: AuthenticatedRequest, res) => {
try {
requireUser(req);
const settings = await computerUseService.updateSettings(req.body || {});
res.json({ success: true, data: { settings } });
} catch (error) {
res.status(400).json({
res.status(getErrorStatusCode(error, 400)).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to save Computer Use settings.',
});
}
});
router.post('/agent-tools/register', async (_req, res) => {
try {
const result = await computerUseService.registerAgentMcp();
res.status(201).json({ success: true, data: result });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to register Computer Use MCP.',
});
}
});
router.post('/runtime/install', async (_req, res) => {
router.post('/runtime/install', async (req: AuthenticatedRequest, res) => {
try {
requireUser(req);
const result = await computerUseService.installRuntime();
res.status(result.success ? 200 : 500).json({
success: result.success,
@@ -100,7 +91,7 @@ router.post('/runtime/install', async (_req, res) => {
error: result.success ? undefined : result.message,
});
} catch (error) {
res.status(500).json({
res.status(getErrorStatusCode(error, 500)).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to install Computer Use runtime.',
});
@@ -118,18 +109,6 @@ router.get('/sessions', async (req: AuthenticatedRequest, res) => {
}
});
router.post('/sessions', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.createSession(requireUser(req));
res.status(session.status === 'unavailable' ? 202 : 201).json({ success: true, data: { session } });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to create Computer Use session.',
});
}
});
router.post('/sessions/:sessionId/screenshot', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.userScreenshot(requireUser(req), readParam(req.params.sessionId));
@@ -169,18 +148,6 @@ router.post('/sessions/:sessionId/click', async (req: AuthenticatedRequest, res)
}
});
router.post('/sessions/:sessionId/type', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.userType(requireUser(req), readParam(req.params.sessionId), String(req.body?.text || ''));
res.json({ success: true, data: { session } });
} catch (error) {
res.status(400).json({
success: false,
error: error instanceof Error ? error.message : 'Failed to type text.',
});
}
});
router.post('/sessions/:sessionId/press-key', async (req: AuthenticatedRequest, res) => {
try {
const session = await computerUseService.userPressKey(requireUser(req), readParam(req.params.sessionId), String(req.body?.key || ''));

View File

@@ -7,14 +7,16 @@ import { appConfigDb } from '@/modules/database/repositories/app-config.js';
import { providerMcpService } from '@/modules/providers/services/mcp.service.js';
import { getModuleDir } from '@/utils/runtime-paths.js';
import {
executor,
captureScreenshot as captureScreenshotRuntime,
getRuntimeReadiness as getExecutorReadiness,
type Point,
type ClickButton,
type ScrollDirection,
} from '@/modules/computer-use/computer-executor.js';
import { runRawComputerAction } from '@/modules/computer-use/actions/raw-action-dispatcher.js';
import type { RawComputerAction } from '@/modules/computer-use/actions/raw-action-types.js';
import { desktopAgentRelay } from '@/modules/computer-use/desktop-agent-relay.service.js';
import { computerSemanticsService } from '@/modules/computer-use/computer-semantics.service.js';
import { semanticOperationNames } from '@/modules/computer-use/semantics/semantic-tool-dispatcher.js';
const __dirname = getModuleDir(import.meta.url);
const IS_PLATFORM = process.env.VITE_IS_PLATFORM === 'true';
@@ -22,9 +24,6 @@ const MAX_SESSIONS_PER_OWNER = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE
const SESSION_TTL_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_USE_SESSION_TTL_MS || String(30 * 60 * 1000), 10);
const COMPUTER_USE_SETTINGS_KEY = 'computer_use_settings';
const COMPUTER_USE_MCP_TOKEN_KEY = 'computer_use_mcp_token';
const DEFAULT_AGENT_WAIT_MS = 1000;
const MAX_AGENT_WAIT_MS = 10_000;
type ComputerUseRuntime = 'cloud' | 'local';
type ComputerUseSessionStatus = 'ready' | 'stopped' | 'unavailable';
@@ -61,7 +60,6 @@ type ComputerUseOwner = {
type ComputerUseSettings = {
enabled: boolean;
agentToolsEnabled: boolean;
};
type RuntimeReadiness = {
@@ -79,7 +77,6 @@ let lastInstallMessage: string | null = null;
const DEFAULT_SETTINGS: ComputerUseSettings = {
enabled: false,
agentToolsEnabled: false,
};
const AGENT_OWNER_ID = 'agent';
const MCP_SERVER_NAME = 'cloudcli-computer-use';
@@ -99,7 +96,6 @@ function readSettings(): ComputerUseSettings {
const parsed = JSON.parse(raw) as Partial<ComputerUseSettings>;
return {
enabled: parsed.enabled === true,
agentToolsEnabled: parsed.agentToolsEnabled === true,
};
} catch (error: any) {
console.warn('[Computer Use] Failed to read settings:', error?.message || error);
@@ -110,7 +106,6 @@ function readSettings(): ComputerUseSettings {
function writeSettings(settings: ComputerUseSettings): ComputerUseSettings {
const normalized = {
enabled: settings.enabled === true,
agentToolsEnabled: settings.agentToolsEnabled === true,
};
appConfigDb.set(COMPUTER_USE_SETTINGS_KEY, JSON.stringify(normalized));
@@ -274,6 +269,20 @@ function canAccessSession(ownerId: string, session: ComputerUseSession): boolean
return session.ownerId === ownerId || session.ownerId === AGENT_OWNER_ID;
}
/**
 * Normalises an optional session id.
 *
 * @returns The trimmed id, or `null` when the input is not a string or is
 *   empty after trimming.
 */
function normalizeSessionId(sessionId?: string | null): string | null {
  const cleaned = typeof sessionId === 'string' ? sessionId.trim() : '';
  return cleaned.length > 0 ? cleaned : null;
}
/**
 * Finds the agent-owned session to reuse.
 *
 * @returns The `ready` session owned by the agent with the latest
 *   `updatedAt`, or `null` when the agent has no ready session.
 */
function findActiveAgentSession(): ComputerUseSession | null {
  const readySessions = ownerSessions(AGENT_OWNER_ID).filter((candidate) => candidate.status === 'ready');
  // Newest first; `filter` returned a fresh array, so sorting in place is safe.
  readySessions.sort((left, right) => Date.parse(right.updatedAt) - Date.parse(left.updatedAt));
  const [mostRecent] = readySessions;
  return mostRecent || null;
}
async function expireStaleSessions(now = Date.now()): Promise<void> {
for (const session of sessions.values()) {
if (session.status !== 'ready') {
@@ -301,17 +310,6 @@ async function expireStaleSessions(now = Date.now()): Promise<void> {
// `desktopAgentRelay` and applies the returned screenshot. The local server
// itself never touches the OS in cloud mode.
/** One desktop interaction expressed in screenshot-pixel coordinate space. */
export type ComputerAction =
| { type: 'screenshot' }
| { type: 'mouse_move'; point: Point }
| { type: 'click'; button: ClickButton; point?: Point; double?: boolean }
| { type: 'drag'; from: Point; to: Point; button?: ClickButton }
| { type: 'type'; text: string }
| { type: 'key'; key: string }
| { type: 'scroll'; direction: ScrollDirection; amount?: number; point?: Point }
| { type: 'wait'; ms?: number };
/** Shape the desktop agent returns for any relayed action. */
type RelayResult = {
screenshotDataUrl?: string | null;
@@ -333,14 +331,9 @@ function applyRelayResult(session: ComputerUseSession, result: RelayResult): voi
session.updatedAt = new Date().toISOString();
}
function normalizeAgentWaitMs(ms: number | undefined): number {
if (ms === undefined) {
return DEFAULT_AGENT_WAIT_MS;
}
if (!Number.isFinite(ms)) {
throw new Error('Computer Use wait duration must be a finite number.');
}
return Math.trunc(Math.max(0, Math.min(ms, MAX_AGENT_WAIT_MS)));
/**
 * Returns a shallow copy of `args` without the `sessionId` key.
 * The input object is left untouched.
 */
function stripSessionArgs(args: Record<string, unknown>): Record<string, unknown> {
  const toolArgs: Record<string, unknown> = { ...args };
  delete toolArgs.sessionId;
  return toolArgs;
}
async function refreshScreenshot(session: ComputerUseSession): Promise<void> {
@@ -349,16 +342,11 @@ async function refreshScreenshot(session: ComputerUseSession): Promise<void> {
applyRelayResult(session, result);
return;
}
const { dataUrl, size } = await captureScreenshotRuntime();
session.screenshotDataUrl = dataUrl;
if (size) {
session.displaySize = size;
}
session.updatedAt = new Date().toISOString();
applyRelayResult(session, await runRawComputerAction({ type: 'screenshot' }, session));
}
/** Runs one action and refreshes the session screenshot afterwards. */
async function performAction(session: ComputerUseSession, action: ComputerAction): Promise<void> {
async function performAction(session: ComputerUseSession, action: RawComputerAction): Promise<void> {
if (getRuntime() === 'cloud') {
const result = (await desktopAgentRelay.relay(action.type, {
...action,
@@ -369,32 +357,7 @@ async function performAction(session: ComputerUseSession, action: ComputerAction
return;
}
switch (action.type) {
case 'screenshot':
break;
case 'mouse_move':
await executor.moveTo(session, action.point);
break;
case 'click':
await executor.click(session, action.button, action.point, action.double === true);
break;
case 'drag':
await executor.drag(session, action.from, action.to, action.button ?? 'left');
break;
case 'type':
await executor.type(action.text);
break;
case 'key':
await executor.pressChord(action.key);
break;
case 'scroll':
await executor.scroll(session, action.direction, action.amount ?? 3, action.point);
break;
case 'wait':
await new Promise((resolve) => setTimeout(resolve, normalizeAgentWaitMs(action.ms)));
break;
}
await refreshScreenshot(session);
applyRelayResult(session, await runRawComputerAction(action, session));
}
/** Reads the current cursor position in screenshot-pixel space. */
@@ -410,7 +373,9 @@ async function getCursorPosition(session: ComputerUseSession): Promise<Point> {
}
return session.cursor ? { x: session.cursor.x, y: session.cursor.y } : { x: 0, y: 0 };
}
return executor.cursorPosition(session);
const result = await runRawComputerAction({ type: 'cursor_position' }, session);
applyRelayResult(session, result);
return result.position || session.cursor || { x: 0, y: 0 };
}
function assertReady(session: ComputerUseSession): void {
@@ -421,14 +386,14 @@ function assertReady(session: ComputerUseSession): void {
/**
* Whether agent tools may operate right now. Cloud mode depends purely on a
* connected desktop agent; local mode depends on the two opt-in settings.
* connected desktop agent; local mode depends on the single feature setting.
*/
function agentToolsAvailable(): boolean {
if (getRuntime() === 'cloud') {
return desktopAgentRelay.isConnected();
}
const settings = readSettings();
return settings.enabled && settings.agentToolsEnabled;
return settings.enabled;
}
function assertAgentToolsAvailable(): void {
@@ -450,21 +415,10 @@ export const computerUseService = {
async updateSettings(settings: Partial<ComputerUseSettings>) {
const current = readSettings();
const enabled = typeof settings.enabled === 'boolean' ? settings.enabled : current.enabled;
const nextSettings = {
...current,
enabled,
agentToolsEnabled: typeof settings.agentToolsEnabled === 'boolean'
? settings.agentToolsEnabled
: enabled,
};
if (!nextSettings.enabled) {
nextSettings.agentToolsEnabled = false;
}
const next = writeSettings(nextSettings);
if (next.agentToolsEnabled) {
const next = writeSettings({ enabled });
if (next.enabled) {
await this.registerAgentMcp();
} else if (current.agentToolsEnabled) {
} else if (current.enabled) {
await this.unregisterAgentMcp();
}
return next;
@@ -487,14 +441,11 @@ export const computerUseService = {
enabled: isCloud ? true : settings.enabled,
runtime: getRuntime(),
available,
requiresDesktopBridge: isCloud,
desktopAgentConnected,
nutInstalled: readiness.nutInstalled,
screenshotInstalled: readiness.screenshotInstalled,
installInProgress: readiness.installInProgress,
sessionCount: sessions.size,
agentToolsEnabled: isCloud ? desktopAgentConnected : settings.agentToolsEnabled,
mcpRecommended: !settings.agentToolsEnabled,
message: available ? 'Computer Use runtime is available.' : getSetupMessage(settings, readiness),
};
},
@@ -704,18 +655,6 @@ export const computerUseService = {
return publicSession(session);
},
async userType(owner: ComputerUseOwner, sessionId: string, text: string) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
if (!session || !canAccessSession(ownerId, session)) {
throw new Error('Computer Use session not found.');
}
assertReady(session);
await performAction(session, { type: 'type', text });
session.lastAction = 'type';
return publicSession(session);
},
async userPressKey(owner: ComputerUseOwner, sessionId: string, key: string) {
const ownerId = getOwnerId(owner);
const session = sessions.get(sessionId);
@@ -730,46 +669,52 @@ export const computerUseService = {
// --- Agent-initiated actions (via MCP) ------------------------------------
async createAgentSession() {
assertAgentToolsAvailable();
return this.createSession({ id: AGENT_OWNER_ID }, { createdBy: 'agent' });
},
async listAgentSessions() {
if (!agentToolsAvailable()) {
return [];
}
await expireStaleSessions();
return [...sessions.values()].map(publicSession);
},
/**
* Resolves a session the agent is allowed to act on. In local mode this
* enforces the in-process per-session consent flag. In cloud mode the linked
* desktop agent is the consent authority (it prompts the user per its own
* consent mode), so this only requires the relay to be connected.
*/
async getConsentedSession(sessionId: string): Promise<ComputerUseSession> {
async getOrCreateAgentSession(): Promise<ComputerUseSession> {
assertAgentToolsAvailable();
const session = sessions.get(sessionId);
await expireStaleSessions();
const existing = findActiveAgentSession();
if (existing) {
return existing;
}
const created = await this.createSession({ id: AGENT_OWNER_ID }, { createdBy: 'agent' });
const session = sessions.get(created.id);
if (!session) {
throw new Error('Computer Use session could not be created.');
}
return session;
},
async getConsentedSession(sessionId?: string): Promise<ComputerUseSession> {
assertAgentToolsAvailable();
const normalizedSessionId = normalizeSessionId(sessionId);
const session = normalizedSessionId
? sessions.get(normalizedSessionId)
: await this.getOrCreateAgentSession();
if (!session) {
throw new Error('Computer Use session not found.');
}
if (getRuntime() !== 'cloud' && !session.agentAccessEnabled) {
throw new Error('Computer Use session is awaiting user consent. Ask the user to grant control in the Computer panel.');
throw new Error(`Computer Use session ${session.id} is awaiting user consent. Ask the user to grant control in the Computer panel.`);
}
assertReady(session);
return session;
},
async agentScreenshot(sessionId: string) {
async agentScreenshot(sessionId?: string) {
const session = await this.getConsentedSession(sessionId);
await refreshScreenshot(session);
session.lastAction = 'screenshot';
return publicSession(session);
},
async agentCursorPosition(sessionId: string) {
async agentCursorPosition(sessionId?: string) {
const session = await this.getConsentedSession(sessionId);
const point = await getCursorPosition(session);
session.cursor = { ...point, actor: 'agent' };
@@ -777,7 +722,7 @@ export const computerUseService = {
return { session: publicSession(session), position: point };
},
async agentMouseMove(sessionId: string, point: Point) {
async agentMouseMove(sessionId: string | undefined, point: Point) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'mouse_move', point });
session.cursor = { ...point, actor: 'agent' };
@@ -785,39 +730,43 @@ export const computerUseService = {
return publicSession(session);
},
async agentClick(sessionId: string, button: ClickButton, point?: Point, doubleClick = false) {
async agentUnifiedClick(sessionId: string | undefined, input: { button?: ClickButton; point?: Point; clickCount?: number }) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'click', button, point, double: doubleClick });
if (point) {
session.cursor = { ...point, actor: 'agent' };
const button = input.button || 'left';
const clickCount = Math.max(1, Math.min(Math.trunc(input.clickCount || 1), 5));
for (let index = 0; index < clickCount; index += 1) {
await performAction(session, { type: 'click', button, point: input.point, double: false });
}
session.lastAction = doubleClick ? 'double_click' : `${button}_click`;
if (input.point) {
session.cursor = { ...input.point, actor: 'agent' };
}
session.lastAction = clickCount > 1 ? `${button}_click:${clickCount}` : `${button}_click`;
return publicSession(session);
},
async agentDrag(sessionId: string, from: Point, to: Point, button: ClickButton = 'left') {
async agentDrag(sessionId: string | undefined, from: Point, to: Point, button: ClickButton = 'left') {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'drag', from, to, button });
session.cursor = { ...to, actor: 'agent' };
session.lastAction = 'left_click_drag';
session.lastAction = `${button}_drag`;
return publicSession(session);
},
async agentType(sessionId: string, text: string) {
async agentType(sessionId: string | undefined, text: string) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'type', text });
session.lastAction = 'type';
return publicSession(session);
},
async agentKey(sessionId: string, key: string) {
async agentKey(sessionId: string | undefined, key: string) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'key', key });
session.lastAction = `key:${key}`;
return publicSession(session);
},
async agentScroll(sessionId: string, input: { direction: ScrollDirection; amount?: number; x?: number; y?: number }) {
async agentScroll(sessionId: string | undefined, input: { direction: ScrollDirection; amount?: number; x?: number; y?: number }) {
const session = await this.getConsentedSession(sessionId);
const point = typeof input.x === 'number' && typeof input.y === 'number' ? { x: input.x, y: input.y } : undefined;
await performAction(session, { type: 'scroll', direction: input.direction, amount: input.amount, point });
@@ -828,16 +777,48 @@ export const computerUseService = {
return publicSession(session);
},
async agentWait(sessionId: string, timeoutMs?: number) {
async agentWait(sessionId?: string, timeoutMs?: number) {
const session = await this.getConsentedSession(sessionId);
await performAction(session, { type: 'wait', ms: timeoutMs });
session.lastAction = 'wait';
return publicSession(session);
},
async agentStopSession(sessionId: string) {
async agentStopSession(sessionId?: string) {
assertAgentToolsAvailable();
return this.stopSession({ id: AGENT_OWNER_ID }, sessionId);
const normalizedSessionId = normalizeSessionId(sessionId);
if (normalizedSessionId) {
return this.stopSession({ id: AGENT_OWNER_ID }, normalizedSessionId);
}
await expireStaleSessions();
const existing = findActiveAgentSession();
if (!existing) {
return { stopped: false };
}
return this.stopSession({ id: AGENT_OWNER_ID }, existing.id);
},
/**
 * Runs one semantic Computer Use tool against a consented session.
 *
 * Rejects tool names that are not in `semanticOperationNames`. The optional
 * `args.sessionId` (strings only) selects the session; the remaining arguments
 * are forwarded with the resolved session id. In the 'cloud' runtime the call
 * is relayed through `desktopAgentRelay`, otherwise it goes to
 * `computerSemanticsService` directly.
 *
 * @returns the public view of the session plus the raw tool result.
 */
async callSemanticTool(toolName: string, args: Record<string, unknown>) {
  const isSupported = semanticOperationNames.has(toolName);
  if (!isSupported) {
    throw new Error(`Unsupported semantic Computer Use tool: ${toolName}`);
  }

  const rawSessionId = typeof args.sessionId === 'string' ? args.sessionId : undefined;
  const requestedSessionId = normalizeSessionId(rawSessionId) ?? undefined;
  const session = await this.getConsentedSession(requestedSessionId);

  // Always forward the id of the session that was actually resolved.
  const toolArgs = { ...stripSessionArgs(args), sessionId: session.id };

  const runsInCloud = getRuntime() === 'cloud';
  const semanticResult = runsInCloud
    ? await desktopAgentRelay.relay('semantic_tool', {
      sessionId: session.id,
      displaySize: session.displaySize,
      toolName,
      arguments: toolArgs,
    })
    : await computerSemanticsService.callTool(toolName, toolArgs);

  applyRelayResult(session, semanticResult as RelayResult);
  session.lastAction = `semantic:${toolName}`;

  return { session: publicSession(session), result: semanticResult };
},
/**

View File

@@ -0,0 +1,82 @@
import { SemanticHelperProcess } from '@/modules/computer-use/semantics/helpers/semantic-helper-process.js';
import { resolveSemanticHelper } from '@/modules/computer-use/semantics/helpers/semantic-helper-resolver.js';
import type { SemanticAdapter, SemanticAdapterCapabilities } from '@/modules/computer-use/semantics/adapters/semantic-adapter.js';
import type { SemanticApp, SemanticAppState, SemanticToolInput } from '@/modules/computer-use/semantics/semantic-types.js';
// Operation names that HelperSemanticAdapter forwards to the helper process;
// each one is passed as the `method` argument of its private `request` call.
type HelperMethod =
| 'list_apps'
| 'get_app_state'
| 'click_element'
| 'perform_secondary_action'
| 'set_value'
| 'type_text'
| 'press_key'
| 'scroll_element'
| 'drag';
/**
 * SemanticAdapter that forwards every operation to a helper executable
 * resolved for the configured platform and architecture.
 *
 * The helper process wrapper is created lazily on the first request and then
 * reused for all later requests.
 */
export class HelperSemanticAdapter implements SemanticAdapter {
  private helper: SemanticHelperProcess | null = null;

  constructor(
    private readonly platform: NodeJS.Platform,
    private readonly arch: NodeJS.Architecture = process.arch,
  ) {}

  /** Reports the adapter's platform with every capability flag enabled. */
  capabilities(): SemanticAdapterCapabilities {
    const { platform } = this;
    return {
      platform,
      appDiscovery: true,
      accessibilityTree: true,
      nativeElementActions: true,
      nativeValueSetting: true,
      targetedInput: true,
    };
  }

  /** Forwards `list_apps` with an empty parameter object. */
  async listApps(): Promise<SemanticApp[]> {
    const apps = await this.request('list_apps', {});
    return apps as SemanticApp[];
  }

  /** Forwards `get_app_state`. */
  async getAppState(input: SemanticToolInput): Promise<SemanticAppState> {
    const state = await this.request('get_app_state', input);
    return state as SemanticAppState;
  }

  /** Forwards `click_element`. */
  async clickElement(input: SemanticToolInput): Promise<SemanticAppState> {
    const state = await this.request('click_element', input);
    return state as SemanticAppState;
  }

  /** Forwards `perform_secondary_action`. */
  async performSecondaryAction(input: SemanticToolInput): Promise<SemanticAppState> {
    const state = await this.request('perform_secondary_action', input);
    return state as SemanticAppState;
  }

  /** Forwards `set_value`. */
  async setValue(input: SemanticToolInput): Promise<SemanticAppState> {
    const state = await this.request('set_value', input);
    return state as SemanticAppState;
  }

  /** Forwards `type_text`. */
  async typeText(input: SemanticToolInput): Promise<SemanticAppState> {
    const state = await this.request('type_text', input);
    return state as SemanticAppState;
  }

  /** Forwards `press_key`. */
  async pressKey(input: SemanticToolInput): Promise<SemanticAppState> {
    const state = await this.request('press_key', input);
    return state as SemanticAppState;
  }

  /** Forwards `scroll_element`. */
  async scrollElement(input: SemanticToolInput): Promise<SemanticAppState> {
    const state = await this.request('scroll_element', input);
    return state as SemanticAppState;
  }

  /** Forwards `drag`. */
  async drag(input: SemanticToolInput): Promise<SemanticAppState> {
    const state = await this.request('drag', input);
    return state as SemanticAppState;
  }

  /**
   * Sends one request to the helper, creating the helper wrapper first if
   * none exists yet. Throws when no helper executable can be resolved.
   */
  private async request(method: HelperMethod, params: Record<string, unknown>): Promise<unknown> {
    const helper = this.helper ?? this.createHelper();
    return helper.request(method, params);
  }

  /** Resolves the helper executable and caches a process wrapper for it. */
  private createHelper(): SemanticHelperProcess {
    const resolution = resolveSemanticHelper(this.platform, this.arch);
    const usable = resolution.available && resolution.path;
    if (!usable) {
      throw new Error(resolution.reason || `Semantic helper is unavailable for ${this.platform}-${this.arch}.`);
    }
    const created = new SemanticHelperProcess(resolution.path as string);
    this.helper = created;
    return created;
  }
}

View File

@@ -0,0 +1,5 @@
import { HelperSemanticAdapter } from '@/modules/computer-use/semantics/adapters/helper-semantic-adapter.js';
/** Builds a helper-backed semantic adapter configured for the 'darwin' platform. */
export function createMacOsSemanticAdapter(): HelperSemanticAdapter {
  const adapter = new HelperSemanticAdapter('darwin');
  return adapter;
}

View File

@@ -0,0 +1,23 @@
import type { SemanticApp, SemanticAppState, SemanticToolInput } from '@/modules/computer-use/semantics/semantic-types.js';
/**
 * Capability report returned by `SemanticAdapter.capabilities()`: the platform
 * the adapter targets plus one boolean flag per feature.
 */
export type SemanticAdapterCapabilities = {
platform: NodeJS.Platform;
appDiscovery: boolean;
accessibilityTree: boolean;
nativeElementActions: boolean;
nativeValueSetting: boolean;
targetedInput: boolean;
};
/**
 * Contract for a platform semantic adapter. `listApps` resolves to a list of
 * apps; every other operation takes a `SemanticToolInput` and resolves to a
 * `SemanticAppState`.
 */
export type SemanticAdapter = {
capabilities(): SemanticAdapterCapabilities;
listApps(): Promise<SemanticApp[]>;
getAppState(input: SemanticToolInput): Promise<SemanticAppState>;
clickElement(input: SemanticToolInput): Promise<SemanticAppState>;
performSecondaryAction(input: SemanticToolInput): Promise<SemanticAppState>;
setValue(input: SemanticToolInput): Promise<SemanticAppState>;
typeText(input: SemanticToolInput): Promise<SemanticAppState>;
pressKey(input: SemanticToolInput): Promise<SemanticAppState>;
scrollElement(input: SemanticToolInput): Promise<SemanticAppState>;
drag(input: SemanticToolInput): Promise<SemanticAppState>;
};

View File

@@ -0,0 +1,5 @@
import { HelperSemanticAdapter } from '@/modules/computer-use/semantics/adapters/helper-semantic-adapter.js';
/** Builds a helper-backed semantic adapter configured for the 'win32' platform. */
export function createWindowsSemanticAdapter(): HelperSemanticAdapter {
  const adapter = new HelperSemanticAdapter('win32');
  return adapter;
}

View File

@@ -0,0 +1,437 @@
import AppKit
import ApplicationServices
import Foundation
/// Loosely typed JSON object used for request parameters and responses.
typealias JSON = [String: Any]
/// Snapshot of one accessibility element captured while walking an app's tree.
struct ElementRecord {
// Index assigned during the walk; stored as a string and used as the lookup key.
let index: String
let role: String
let title: String?
let value: String?
// Keys are "x", "y", "width" and "height"; nil when position or size is unavailable.
let bounds: [String: Double]?
let actions: [String]
}
// Element records of every captured state, keyed by state id.
var stateElements: [String: [ElementRecord]] = [:]
// Live accessibility elements of every captured state, keyed by state id and then by element index.
var stateAxElements: [String: [String: AXUIElement]] = [:]
/// Writes `object` to stdout as a single line of JSON and flushes immediately.
/// When the object cannot be encoded, a fixed error object is written instead.
func jsonLine(_ object: Any) {
    var encoded: String? = nil
    if JSONSerialization.isValidJSONObject(object),
       let data = try? JSONSerialization.data(withJSONObject: object) {
        encoded = String(data: data, encoding: .utf8)
    }
    print(encoded ?? "{\"error\":\"Failed to encode JSON\"}")
    fflush(stdout)
}
/// Emits a success response carrying `result`; a missing id is sent as JSON null.
func respond(id: Any?, result: Any) {
    let payload: [String: Any] = ["id": id ?? NSNull(), "result": result]
    jsonLine(payload)
}
/// Emits an error response carrying `message`; a missing id is sent as JSON null.
func respondError(id: Any?, _ message: String) {
    let payload: [String: Any] = ["id": id ?? NSNull(), "error": message]
    jsonLine(payload)
}
/// Reads an accessibility attribute and returns it when it is a string.
/// Returns nil when the attribute cannot be copied or has another type.
func stringAttr(_ element: AXUIElement, _ attr: CFString) -> String? {
    var raw: CFTypeRef?
    let status = AXUIElementCopyAttributeValue(element, attr, &raw)
    if status != .success {
        return nil
    }
    return raw as? String
}
/// Reads an accessibility attribute and returns it when it is a boolean.
/// Returns nil when the attribute cannot be copied or has another type.
func boolAttr(_ element: AXUIElement, _ attr: CFString) -> Bool? {
    var raw: CFTypeRef?
    let status = AXUIElementCopyAttributeValue(element, attr, &raw)
    if status != .success {
        return nil
    }
    return raw as? Bool
}
/// Reads an accessibility attribute holding a list of elements.
/// Returns an empty list when the attribute cannot be copied or has another type.
func arrayAttr(_ element: AXUIElement, _ attr: CFString) -> [AXUIElement] {
    var raw: CFTypeRef?
    let status = AXUIElementCopyAttributeValue(element, attr, &raw)
    if status != .success {
        return []
    }
    return raw as? [AXUIElement] ?? []
}
/// Lists the accessibility action names the element supports.
/// Returns an empty list when the names cannot be copied.
func actions(_ element: AXUIElement) -> [String] {
    var names: CFArray?
    let status = AXUIElementCopyActionNames(element, &names)
    if status != .success {
        return []
    }
    return names as? [String] ?? []
}
/// Returns the element's frame as a dictionary with "x", "y", "width" and "height".
///
/// Returns nil when the position or size attribute is missing, is not an
/// `AXValue`, or cannot be unpacked as a point / size.
func bounds(_ element: AXUIElement) -> [String: Double]? {
    var positionRef: CFTypeRef?
    var sizeRef: CFTypeRef?
    guard AXUIElementCopyAttributeValue(element, kAXPositionAttribute as CFString, &positionRef) == .success,
          AXUIElementCopyAttributeValue(element, kAXSizeAttribute as CFString, &sizeRef) == .success,
          let positionValue = positionRef,
          let sizeValue = sizeRef,
          // Casts to CoreFoundation types are not checked at runtime, so verify
          // the CF type explicitly before treating the values as AXValue.
          CFGetTypeID(positionValue) == AXValueGetTypeID(),
          CFGetTypeID(sizeValue) == AXValueGetTypeID()
    else { return nil }

    let positionAX = positionValue as! AXValue
    let sizeAX = sizeValue as! AXValue
    var point = CGPoint.zero
    var size = CGSize.zero
    guard AXValueGetValue(positionAX, .cgPoint, &point),
          AXValueGetValue(sizeAX, .cgSize, &size)
    else { return nil }

    return [
        "x": Double(point.x),
        "y": Double(point.y),
        "width": Double(size.width),
        "height": Double(size.height),
    ]
}
/// Captures the role, title, value, frame and actions of `element` under `index`.
/// The role falls back to "AXUnknown"; the title falls back to the description attribute.
func record(_ element: AXUIElement, index: String) -> ElementRecord {
    let role = stringAttr(element, kAXRoleAttribute as CFString) ?? "AXUnknown"
    let title = stringAttr(element, kAXTitleAttribute as CFString)
        ?? stringAttr(element, kAXDescriptionAttribute as CFString)
    let value = stringAttr(element, kAXValueAttribute as CFString)
    let frame = bounds(element)
    let actionNames = actions(element)
    return ElementRecord(
        index: index,
        role: role,
        title: title,
        value: value,
        bounds: frame,
        actions: actionNames
    )
}
/// Looks up the live accessibility element addressed by the string parameters
/// "stateId" and "element_index". Returns nil when either is missing or unknown.
func cachedElement(_ params: JSON) -> AXUIElement? {
    if let stateId = params["stateId"] as? String,
       let elementIndex = params["element_index"] as? String {
        return stateAxElements[stateId]?[elementIndex]
    }
    return nil
}
/// Converts an element record to its JSON form. "index", "role" and "actions"
/// are always present; "title", "value" and "bounds" only when they are set.
func dictionary(_ record: ElementRecord) -> JSON {
    var output: JSON = [:]
    output["index"] = record.index
    output["role"] = record.role
    output["actions"] = record.actions
    if let title = record.title {
        output["title"] = title
    }
    if let value = record.value {
        output["value"] = value
    }
    if let bounds = record.bounds {
        output["bounds"] = bounds
    }
    return output
}
/// Finds a running regular (Dock-visible) application matching `query`.
///
/// Matching is case-insensitive and tried in this order: exact bundle
/// identifier, exact localized name, localized name containing the query.
///
/// - Throws: an `NSError` with code 400 when the query is empty or only
///   whitespace, and code 404 when no running application matches.
func resolveApp(_ query: String) throws -> NSRunningApplication {
    let normalized = query.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
    // An empty query must never be handed to the name matching below, where it
    // could select an unrelated application.
    guard !normalized.isEmpty else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "app is required."])
    }
    let apps = NSWorkspace.shared.runningApplications.filter { app in
        app.activationPolicy == .regular
    }
    if let app = apps.first(where: { $0.bundleIdentifier?.lowercased() == normalized }) {
        return app
    }
    if let app = apps.first(where: { ($0.localizedName ?? "").lowercased() == normalized }) {
        return app
    }
    if let app = apps.first(where: { ($0.localizedName ?? "").lowercased().contains(normalized) }) {
        return app
    }
    throw NSError(domain: "CloudCLISemantics", code: 404, userInfo: [NSLocalizedDescriptionKey: "App is not running: \(query)"])
}
/// Describes every running application whose activation policy is `.regular`.
/// Each entry carries "id", "name", "bundleIdentifier", "pid" and "running".
func listApps() -> [[String: Any]] {
    var entries: [[String: Any]] = []
    for app in NSWorkspace.shared.runningApplications where app.activationPolicy == .regular {
        let entry: [String: Any] = [
            "id": app.bundleIdentifier ?? app.localizedName ?? "\(app.processIdentifier)",
            "name": app.localizedName ?? app.bundleIdentifier ?? "Unknown",
            "bundleIdentifier": app.bundleIdentifier ?? "",
            "pid": Int(app.processIdentifier),
            "running": true,
        ]
        entries.append(entry)
    }
    return entries
}
/// Depth-first, pre-order walk of the accessibility tree rooted at `element`.
///
/// Every visited element is appended to `records` and stored in `axRecords`
/// under a 1-based index string. The walk stops descending below `maxDepth`
/// and stops entirely once `limit` records have been collected.
func walk(_ element: AXUIElement, depth: Int, maxDepth: Int, records: inout [ElementRecord], axRecords: inout [String: AXUIElement], limit: Int) {
    guard depth <= maxDepth, records.count < limit else { return }

    let index = String(records.count + 1)
    records.append(record(element, index: index))
    axRecords[index] = element

    let children = arrayAttr(element, kAXChildrenAttribute as CFString)
    for child in children {
        walk(child, depth: depth + 1, maxDepth: maxDepth, records: &records, axRecords: &axRecords, limit: limit)
        if records.count >= limit {
            break
        }
    }
}
/// Captures the main display and returns it as a base64 PNG data URL.
/// Returns nil when the capture or the PNG encoding fails.
func pngDataUrlForMainDisplay() -> String? {
    guard let image = CGDisplayCreateImage(CGMainDisplayID()) else {
        return nil
    }
    let pngData = NSBitmapImageRep(cgImage: image).representation(using: .png, properties: [:])
    return pngData.map { "data:image/png;base64,\($0.base64EncodedString())" }
}
/// Maximum number of state snapshots kept in `stateElements` / `stateAxElements`.
let maxCachedStates = 32
/// Identifiers of the cached snapshots in creation order, oldest first.
var cachedStateOrder: [String] = []

/// Stores a snapshot under `stateId` and evicts the oldest snapshots beyond
/// `maxCachedStates`, so a long-running helper does not accumulate one
/// snapshot per request forever.
func cacheState(_ stateId: String, records: [ElementRecord], axRecords: [String: AXUIElement]) {
    stateElements[stateId] = records
    stateAxElements[stateId] = axRecords
    cachedStateOrder.append(stateId)
    while cachedStateOrder.count > maxCachedStates {
        let evicted = cachedStateOrder.removeFirst()
        stateElements.removeValue(forKey: evicted)
        stateAxElements.removeValue(forKey: evicted)
    }
}

/// Captures the current state of the application named by `params["app"]`.
///
/// Walks the accessibility tree of the app's first window (or of the app
/// element when it has no windows) to a depth of 5 and at most 300 elements,
/// caches the snapshot under a fresh state id, and returns that id together
/// with a screenshot of the main display, the display size and the element list.
///
/// - Throws: whatever `resolveApp` throws when the application cannot be resolved.
func getAppState(_ params: JSON) throws -> JSON {
    let appName = params["app"] as? String ?? ""
    let app = try resolveApp(appName)
    let axApp = AXUIElementCreateApplication(app.processIdentifier)
    let windows = arrayAttr(axApp, kAXWindowsAttribute as CFString)
    let root = windows.first ?? axApp
    var records: [ElementRecord] = []
    var axRecords: [String: AXUIElement] = [:]
    walk(root, depth: 0, maxDepth: 5, records: &records, axRecords: &axRecords, limit: 300)
    let stateId = "state_\(UUID().uuidString)"
    cacheState(stateId, records: records, axRecords: axRecords)
    let elements = records.map(dictionary)
    return [
        "stateId": stateId,
        "app": app.localizedName ?? app.bundleIdentifier ?? appName,
        "platform": "darwin",
        "screenshotDataUrl": pngDataUrlForMainDisplay() ?? NSNull(),
        "displaySize": [
            "width": Int(CGDisplayPixelsWide(CGMainDisplayID())),
            "height": Int(CGDisplayPixelsHigh(CGMainDisplayID())),
        ],
        "elements": elements,
        "accessibilityTree": elements,
        "treeText": elements.map { "\($0["index"] ?? "") \($0["role"] ?? "") \($0["title"] ?? "")" }.joined(separator: "\n"),
    ]
}
/// Maps a button name to a CoreGraphics mouse button: "right" and "middle"
/// map to `.right` and `.center`; anything else, including non-strings, to `.left`.
func cgMouseButton(_ value: Any?) -> CGMouseButton {
    let name = value as? String
    if name == "right" {
        return .right
    }
    if name == "middle" {
        return .center
    }
    return .left
}
/// Returns the (down, up) event types used to click with `button`.
func mouseEventTypes(_ button: CGMouseButton) -> (CGEventType, CGEventType) {
    if button == .right {
        return (.rightMouseDown, .rightMouseUp)
    }
    if button == .center {
        return (.otherMouseDown, .otherMouseUp)
    }
    return (.leftMouseDown, .leftMouseUp)
}
/// Posts `clickCount` (at least one) mouse clicks with `button` at `point`.
///
/// Each down/up pair carries its 1-based position in the sequence as the
/// click state, which is what lets applications recognise the second click
/// as a double click, the third as a triple click, and so on.
///
/// - Throws: an `NSError` with code 500 when no event source can be created.
func postMouseClick(point: CGPoint, button: CGMouseButton, clickCount: Int = 1) throws {
    guard let source = CGEventSource(stateID: .combinedSessionState) else {
        throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create CGEventSource"])
    }
    let eventTypes = mouseEventTypes(button)
    for clickIndex in 1...max(1, clickCount) {
        let down = CGEvent(mouseEventSource: source, mouseType: eventTypes.0, mouseCursorPosition: point, mouseButton: button)
        let up = CGEvent(mouseEventSource: source, mouseType: eventTypes.1, mouseCursorPosition: point, mouseButton: button)
        // Without the click state every pair is delivered as an unrelated single click.
        down?.setIntegerValueField(.mouseEventClickState, value: Int64(clickIndex))
        up?.setIntegerValueField(.mouseEventClickState, value: Int64(clickIndex))
        down?.post(tap: .cghidEventTap)
        up?.post(tap: .cghidEventTap)
        // Short pause between clicks of the same sequence.
        usleep(80_000)
    }
}
/// Presses `button` at `from`, drags to `to` and releases there.
///
/// The drag event type matches the pressed button, so right and middle
/// button drags are no longer reported as left-button drags.
///
/// - Throws: an `NSError` with code 500 when no event source can be created.
func postDrag(from: CGPoint, to: CGPoint, button: CGMouseButton) throws {
    guard let source = CGEventSource(stateID: .combinedSessionState) else {
        throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create CGEventSource"])
    }
    let eventTypes = mouseEventTypes(button)
    let draggedType: CGEventType
    switch button {
    case .right: draggedType = .rightMouseDragged
    case .center: draggedType = .otherMouseDragged
    default: draggedType = .leftMouseDragged
    }
    CGEvent(mouseEventSource: source, mouseType: eventTypes.0, mouseCursorPosition: from, mouseButton: button)?.post(tap: .cghidEventTap)
    usleep(80_000)
    CGEvent(mouseEventSource: source, mouseType: draggedType, mouseCursorPosition: to, mouseButton: button)?.post(tap: .cghidEventTap)
    usleep(80_000)
    CGEvent(mouseEventSource: source, mouseType: eventTypes.1, mouseCursorPosition: to, mouseButton: button)?.post(tap: .cghidEventTap)
}
/// Runs `script` with `/usr/bin/osascript -e` and waits for it to finish.
///
/// Standard output is discarded. Standard error is drained before waiting for
/// the process, so a script that writes a lot of output cannot block on a
/// full pipe while this function blocks waiting for it to exit.
///
/// - Throws: the launch error, or an `NSError` carrying the script's stderr
///   text and its exit status as the code when the exit status is non-zero.
func runAppleScript(_ script: String) throws {
    let process = Process()
    process.executableURL = URL(fileURLWithPath: "/usr/bin/osascript")
    process.arguments = ["-e", script]
    // Nothing reads the script's stdout, so do not route it into a pipe.
    process.standardOutput = FileHandle.nullDevice
    let stderr = Pipe()
    process.standardError = stderr
    try process.run()
    // Reads until the child closes stderr, i.e. until it exits.
    let errorData = stderr.fileHandleForReading.readDataToEndOfFile()
    process.waitUntilExit()
    if process.terminationStatus != 0 {
        let message = String(data: errorData, encoding: .utf8) ?? "AppleScript failed."
        throw NSError(domain: "CloudCLISemantics", code: Int(process.terminationStatus), userInfo: [NSLocalizedDescriptionKey: message])
    }
}
/// Escapes backslashes and double quotes so `value` can be embedded in a
/// double-quoted AppleScript string literal. Backslashes are escaped first so
/// the backslashes added for quotes are not doubled.
func escapedAppleScriptString(_ value: String) -> String {
    let backslashesEscaped = value.replacingOccurrences(of: "\\", with: "\\\\")
    let quotesEscaped = backslashesEscaped.replacingOccurrences(of: "\"", with: "\\\"")
    return quotesEscaped
}
/// Resolves the screen point an action should target.
///
/// Explicit numeric "x"/"y" parameters win. Otherwise the centre of the
/// cached element addressed by "stateId" + "element_index" is used. Returns
/// nil when neither form yields a point.
func pointForElement(_ params: JSON) -> CGPoint? {
    if let x = params["x"] as? Double, let y = params["y"] as? Double {
        return CGPoint(x: x, y: y)
    }

    guard let stateId = params["stateId"] as? String,
          let elementIndex = params["element_index"] as? String
    else {
        return nil
    }

    let match = stateElements[stateId]?.first(where: { $0.index == elementIndex })
    guard let frame = match?.bounds,
          let originX = frame["x"],
          let originY = frame["y"],
          let width = frame["width"],
          let height = frame["height"]
    else {
        return nil
    }
    return CGPoint(x: originX + width / 2, y: originY + height / 2)
}
/// Clicks an element and returns the refreshed application state.
///
/// A single left click on a cached element that advertises the press action
/// is performed through the accessibility API. In every other case, or when
/// that action fails, synthetic mouse clicks are posted at the resolved point.
///
/// - Throws: an `NSError` with code 400 when no target point can be resolved.
func click(_ params: JSON) throws -> JSON {
    let button = cgMouseButton(params["mouse_button"])
    let clickCount = params["click_count"] as? Int ?? 1

    if let element = cachedElement(params), button == .left, clickCount == 1 {
        let supportsPress = actions(element).contains(kAXPressAction as String)
        if supportsPress, AXUIElementPerformAction(element, kAXPressAction as CFString) == .success {
            return try getAppState(params)
        }
    }

    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "click_element requires x/y or stateId + element_index"])
    }
    try postMouseClick(point: point, button: button, clickCount: clickCount)
    return try getAppState(params)
}
/// Opens the secondary (context) action of an element and returns the
/// refreshed application state.
///
/// Uses the accessibility show-menu action when the cached element offers it
/// and it succeeds; otherwise posts a right click at the resolved point.
///
/// - Throws: an `NSError` with code 400 when no target point can be resolved.
func performSecondaryAction(_ params: JSON) throws -> JSON {
    if let element = cachedElement(params) {
        let supportsMenu = actions(element).contains(kAXShowMenuAction as String)
        if supportsMenu, AXUIElementPerformAction(element, kAXShowMenuAction as CFString) == .success {
            return try getAppState(params)
        }
    }

    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "perform_secondary_action requires x/y or stateId + element_index"])
    }
    try postMouseClick(point: point, button: .right)
    return try getAppState(params)
}
/// Sets the value of an element and returns the refreshed application state.
///
/// First tries to write the accessibility value attribute of the cached
/// element. If that is not possible it clicks the resolved point, sends
/// Command-A and then types the value through System Events.
///
/// - Throws: an `NSError` with code 400 when "value" is missing or no target
///   point can be resolved; AppleScript errors are propagated.
func setValue(_ params: JSON) throws -> JSON {
    guard let value = params["value"] as? String else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "set_value requires value"])
    }

    if let element = cachedElement(params) {
        let status = AXUIElementSetAttributeValue(element, kAXValueAttribute as CFString, value as CFTypeRef)
        if status == .success {
            return try getAppState(params)
        }
    }

    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "set_value requires x/y or stateId + element_index"])
    }
    try postMouseClick(point: point, button: .left)
    let selectAllScript = "tell application \"System Events\" to keystroke \"a\" using command down"
    try runAppleScript(selectAllScript)
    let typeValueScript = "tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(value))\""
    try runAppleScript(typeValueScript)
    return try getAppState(params)
}
/// Types `params["text"]` (empty when missing) through System Events and
/// returns the refreshed application state.
func typeText(_ params: JSON) throws -> JSON {
    let text = (params["text"] as? String) ?? ""
    let script = "tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(text))\""
    try runAppleScript(script)
    return try getAppState(params)
}
/// Builds the AppleScript ` using {...}` clause for the given modifier names.
/// Names are matched case-insensitively, unknown names are ignored, and an
/// empty string is returned when no known modifier remains.
func appleScriptModifiers(_ parts: [String]) -> String {
    let clauseByName: [String: String] = [
        "cmd": "command down",
        "command": "command down",
        "meta": "command down",
        "ctrl": "control down",
        "control": "control down",
        "alt": "option down",
        "option": "option down",
        "shift": "shift down",
    ]

    var clauses: [String] = []
    for part in parts {
        if let clause = clauseByName[part.lowercased()] {
            clauses.append(clause)
        }
    }

    if clauses.isEmpty {
        return ""
    }
    return " using {" + clauses.joined(separator: ", ") + "}"
}
/// Maps a named key (case-insensitive) to its macOS virtual key code.
///
/// Covers editing keys, arrows, navigation keys and F1–F12. Returns nil for
/// anything else, which callers treat as literal text to type.
func appleScriptKeyCode(_ key: String) -> Int? {
    switch key.lowercased() {
    case "return", "enter": return 36
    case "tab": return 48
    case "space": return 49
    case "delete", "backspace": return 51
    case "escape", "esc": return 53
    case "left", "arrowleft": return 123
    case "right", "arrowright": return 124
    case "down", "arrowdown": return 125
    case "up", "arrowup": return 126
    // Navigation keys: without these the key name itself would be typed as text.
    case "home": return 115
    case "pageup", "page_up": return 116
    case "forwarddelete", "forward_delete": return 117
    case "end": return 119
    case "pagedown", "page_down": return 121
    // Function keys (the codes are not contiguous).
    case "f1": return 122
    case "f2": return 120
    case "f3": return 99
    case "f4": return 118
    case "f5": return 96
    case "f6": return 97
    case "f7": return 98
    case "f8": return 100
    case "f9": return 101
    case "f10": return 109
    case "f11": return 103
    case "f12": return 111
    default: return nil
    }
}
/// Presses a key or key combination such as "cmd+shift+s" and returns the
/// refreshed application state.
///
/// The combination is split on "+": the last part is the key and the parts
/// before it are modifiers. Named keys are sent as key codes; any other key
/// is typed as text. A trailing "+" means the key itself is the plus sign,
/// so "+" and "cmd++" work as expected.
func pressKey(_ params: JSON) throws -> JSON {
    let raw = params["key"] as? String ?? ""
    var parts = raw.split(separator: "+").map { String($0).trimmingCharacters(in: .whitespacesAndNewlines) }.filter { !$0.isEmpty }
    // Splitting drops the empty pieces around "+", which would otherwise turn
    // the last modifier name into the key.
    if raw.trimmingCharacters(in: .whitespacesAndNewlines).hasSuffix("+") {
        parts.append("+")
    }
    let key = parts.last ?? raw
    let modifiers = appleScriptModifiers(Array(parts.dropLast()))
    if let keyCode = appleScriptKeyCode(key) {
        try runAppleScript("tell application \"System Events\" to key code \(keyCode)\(modifiers)")
    } else {
        try runAppleScript("tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(key))\"\(modifiers)")
    }
    return try getAppState(params)
}
/// Scrolls at the resolved point and returns the refreshed application state.
///
/// The cursor is moved to the point first. "direction" is one of "up",
/// "down" (default), "left" or "right"; "pages" (default 1) is scaled to
/// 8 lines per page, with a minimum of one line and an upper cap so that an
/// oversized or non-finite value cannot overflow the 32-bit wheel delta.
///
/// - Throws: an `NSError` with code 400 when no target point can be resolved.
func scrollElement(_ params: JSON) throws -> JSON {
    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "scroll_element requires x/y or stateId + element_index"])
    }
    CGWarpMouseCursorPosition(point)
    let direction = params["direction"] as? String ?? "down"
    let pages = params["pages"] as? Double ?? 1.0
    let maxScrollLines = 10_000.0
    let requestedLines = pages.isFinite ? abs(pages) * 8.0 : 1.0
    let amount = Int32(min(max(1.0, requestedLines), maxScrollLines))
    let vertical = direction == "up" ? amount : direction == "down" ? -amount : 0
    let horizontal = direction == "left" ? amount : direction == "right" ? -amount : 0
    CGEvent(scrollWheelEvent2Source: nil, units: .line, wheelCount: 2, wheel1: vertical, wheel2: horizontal)?.post(tap: .cghidEventTap)
    return try getAppState(params)
}
/// Drags from (from_x, from_y) to (to_x, to_y) with the requested mouse
/// button and returns the refreshed application state.
///
/// - Throws: an `NSError` with code 400 when any coordinate is missing or not numeric.
func drag(_ params: JSON) throws -> JSON {
    let fromX = params["from_x"] as? Double
    let fromY = params["from_y"] as? Double
    let toX = params["to_x"] as? Double
    let toY = params["to_y"] as? Double

    guard let startX = fromX, let startY = fromY, let endX = toX, let endY = toY else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "drag requires from_x/from_y/to_x/to_y"])
    }

    let start = CGPoint(x: startX, y: startY)
    let end = CGPoint(x: endX, y: endY)
    try postDrag(from: start, to: end, button: cgMouseButton(params["mouse_button"]))
    return try getAppState(params)
}
/// Dispatches one decoded request to its handler and writes the response.
///
/// Unknown methods and thrown errors produce an error response carrying the
/// request id; successful calls produce a result response.
func handle(_ request: JSON) {
    let id = request["id"]
    let method = request["method"] as? String ?? ""
    let params = request["params"] as? JSON ?? [:]

    do {
        let result: Any
        switch method {
        case "list_apps": result = listApps()
        case "get_app_state": result = try getAppState(params)
        case "click_element": result = try click(params)
        case "perform_secondary_action": result = try performSecondaryAction(params)
        case "set_value": result = try setValue(params)
        case "type_text": result = try typeText(params)
        case "press_key": result = try pressKey(params)
        case "scroll_element": result = try scrollElement(params)
        case "drag": result = try drag(params)
        default:
            respondError(id: id, "Method is not implemented yet: \(method)")
            return
        }
        respond(id: id, result: result)
    } catch {
        respondError(id: id, error.localizedDescription)
    }
}
// Request loop: one JSON object per stdin line until stdin is closed.
// Lines that are not a JSON object are answered with an id-less error.
while let line = readLine() {
    let parsed = line.data(using: .utf8).flatMap { try? JSONSerialization.jsonObject(with: $0) }
    if let request = parsed as? JSON {
        handle(request)
    } else {
        respondError(id: nil, "Invalid JSON request")
    }
}

View File

@@ -0,0 +1,124 @@
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import readline from 'node:readline';
// Plain JSON object, used for request parameters and decoded helper responses.
type JsonRecord = Record<string, unknown>;
// Bookkeeping for one in-flight helper request: the promise callbacks plus the
// timer that rejects the request when no response arrives in time.
type PendingRequest = {
resolve: (value: unknown) => void;
reject: (error: Error) => void;
timer: ReturnType<typeof setTimeout>;
};
// Request timeout parsed once from CLOUDCLI_SEMANTICS_HELPER_TIMEOUT_MS ('60000' when unset or empty).
const DEFAULT_TIMEOUT_MS = Number.parseInt(process.env.CLOUDCLI_SEMANTICS_HELPER_TIMEOUT_MS || '60000', 10);

/** Returns the configured timeout, or 60000 ms when the configured value is not a positive finite number. */
function timeoutMs(): number {
  const configuredIsUsable = Number.isFinite(DEFAULT_TIMEOUT_MS) && DEFAULT_TIMEOUT_MS > 0;
  if (!configuredIsUsable) {
    return 60000;
  }
  return DEFAULT_TIMEOUT_MS;
}
/** Extracts a printable message: `message` for Error instances, `String(value)` otherwise. */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
/**
 * Line-delimited JSON client for the semantic helper executable.
 *
 * The helper is spawned lazily on the first request. Each request is written
 * to the helper's stdin as `{ id, method, params }` on a single line and
 * matched by `id` against the `{ id, result | error }` lines read from the
 * helper's stdout. Once the helper has exited, the next request spawns a new one.
 */
export class SemanticHelperProcess {
  private child: ChildProcessWithoutNullStreams | null = null;
  private reader: readline.Interface | null = null;
  private nextId = 1;
  private pending = new Map<number, PendingRequest>();

  constructor(private readonly executablePath: string) {}

  /**
   * Sends one request to the helper and resolves with its `result`.
   *
   * Rejects when the helper reports an error, when the request cannot be
   * written, when the helper exits or is stopped, or when no response arrives
   * within the configured timeout.
   *
   * @throws Error when the helper's stdin is not writable.
   */
  async request(method: string, params: JsonRecord): Promise<unknown> {
    this.ensureStarted();
    const child = this.child;
    if (!child?.stdin.writable) {
      throw new Error('Semantic helper process is not running.');
    }

    const id = this.nextId++;
    return new Promise((resolve, reject) => {
      const timer = setTimeout(() => {
        this.pending.delete(id);
        reject(new Error(`Semantic helper request timed out: ${method}`));
      }, timeoutMs());
      this.pending.set(id, { resolve, reject, timer });

      child.stdin.write(`${JSON.stringify({ id, method, params })}\n`, (error) => {
        if (!error) {
          return;
        }
        // Fail fast instead of waiting for the timeout when the write itself failed.
        const entry = this.pending.get(id);
        if (!entry) {
          return;
        }
        clearTimeout(entry.timer);
        this.pending.delete(id);
        entry.reject(new Error(`Failed to send semantic helper request: ${error.message}`));
      });
    });
  }

  /** Terminates the helper (SIGTERM) and rejects every in-flight request. */
  stop(): void {
    const child = this.child;
    this.child = null;
    this.reader?.close();
    this.reader = null;
    this.rejectAll('Semantic helper stopped.');
    if (child) {
      try { child.kill('SIGTERM'); } catch { /* noop */ }
    }
  }

  /** Spawns the helper and wires up its streams, unless one is already tracked. */
  private ensureStarted(): void {
    if (this.child) {
      return;
    }

    const child = spawn(this.executablePath, [], {
      stdio: ['pipe', 'pipe', 'pipe'],
      windowsHide: true,
    });
    const reader = readline.createInterface({ input: child.stdout });
    this.child = child;
    this.reader = reader;

    reader.on('line', (line) => this.handleLine(line));
    child.stderr.on('data', (chunk) => {
      const text = String(chunk).trim();
      if (text) {
        console.error('[SemanticHelper]', text);
      }
    });
    // A stream 'error' without a listener is thrown as an uncaught exception.
    // Writing to a helper that failed to start or has died raises one (EPIPE),
    // so it must be handled here to keep the host process alive.
    child.stdin.on('error', (error) => {
      console.error('[SemanticHelper] stdin error:', error.message);
    });

    // Only the child that is still current may clear state: events of a child
    // that was stopped or replaced must not reject a newer child's requests.
    const release = (reason: string): void => {
      if (this.child !== child) {
        return;
      }
      this.child = null;
      if (this.reader === reader) {
        this.reader = null;
      }
      this.rejectAll(reason);
    };
    child.once('error', (error) => {
      release(`Failed to start semantic helper: ${error.message}`);
    });
    child.once('exit', (code) => {
      release(`Semantic helper exited with code ${code ?? 'null'}.`);
    });
  }

  /** Parses one stdout line and settles the pending request it answers. */
  private handleLine(line: string): void {
    let message: JsonRecord;
    try {
      message = JSON.parse(line) as JsonRecord;
    } catch (error) {
      console.error('[SemanticHelper] Invalid JSON response:', errorMessage(error));
      return;
    }

    // Lines without a numeric id cannot be matched to a request and are ignored.
    const id = typeof message.id === 'number' ? message.id : null;
    if (id === null) {
      return;
    }
    const pending = this.pending.get(id);
    if (!pending) {
      return;
    }

    clearTimeout(pending.timer);
    this.pending.delete(id);
    if (message.error) {
      pending.reject(new Error(typeof message.error === 'string' ? message.error : 'Semantic helper request failed.'));
      return;
    }
    pending.resolve(message.result);
  }

  /** Rejects and forgets every in-flight request with the given reason. */
  private rejectAll(reason: string): void {
    for (const [id, request] of this.pending.entries()) {
      clearTimeout(request.timer);
      request.reject(new Error(reason));
      this.pending.delete(id);
    }
  }
}

View File

@@ -0,0 +1,97 @@
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
/** Platforms for which a helper executable name is defined. */
export type SemanticHelperPlatform = 'darwin' | 'win32';
/**
 * Outcome of looking for the helper executable. `path` is set only when
 * `available` is true; `source` tells which candidate location matched
 * ('missing' when none did); `reason` explains an unavailable result.
 */
export type SemanticHelperResolution = {
available: boolean;
path: string | null;
source: 'bundled' | 'dev' | 'missing';
platform: NodeJS.Platform;
arch: NodeJS.Architecture;
reason?: string;
};
/** Returns the helper's file name for `platform`, or null when the platform has no helper. */
function helperExecutableName(platform: NodeJS.Platform): string | null {
  switch (platform) {
    case 'darwin':
      return 'CloudCLISemantics';
    case 'win32':
      return 'CloudCLISemantics.exe';
    default:
      return null;
  }
}
/**
 * Reports whether `filePath` is accessible: it is probed for execute
 * permission first and, if that fails, for mere existence.
 */
function pathExists(filePath: string): boolean {
  const probes = [fs.constants.X_OK, fs.constants.F_OK];
  for (const mode of probes) {
    try {
      fs.accessSync(filePath, mode);
      return true;
    } catch {
      // Fall through to the next, weaker probe.
    }
  }
  return false;
}
/**
 * Lists the locations to probe for the helper, in priority order: the
 * `bin/<platform>-<arch>` directory next to this module's parent directory
 * ('bundled'), then the same directory inside the source tree under the
 * current working directory ('dev'). Empty when the platform has no helper.
 */
function candidatePaths(platform: NodeJS.Platform, arch: NodeJS.Architecture): Array<{ source: 'bundled' | 'dev'; path: string }> {
  const executable = helperExecutableName(platform);
  if (!executable) {
    return [];
  }

  const binDirName = `${platform}-${arch}`;
  const bundledPath = path.resolve(__dirname, '..', 'bin', binDirName, executable);
  const devPath = path.resolve(
    process.cwd(),
    'server',
    'modules',
    'computer-use',
    'semantics',
    'bin',
    binDirName,
    executable,
  );

  return [
    { source: 'bundled', path: bundledPath },
    { source: 'dev', path: devPath },
  ];
}
/**
 * Locates the semantic helper executable for a platform / architecture pair
 * (defaults: the current process).
 *
 * @returns an available resolution for the first candidate path that exists,
 *   otherwise an unavailable one whose `reason` says whether the platform is
 *   unsupported or the executable was not found.
 */
export function resolveSemanticHelper(
  platform: NodeJS.Platform = process.platform,
  arch: NodeJS.Architecture = process.arch,
): SemanticHelperResolution {
  const unavailable = (reason: string): SemanticHelperResolution => ({
    available: false,
    path: null,
    source: 'missing',
    platform,
    arch,
    reason,
  });

  const executable = helperExecutableName(platform);
  if (!executable) {
    return unavailable(`Semantic Computer Use helper is not supported on ${platform}.`);
  }

  const match = candidatePaths(platform, arch).find((candidate) => pathExists(candidate.path));
  if (match) {
    return {
      available: true,
      path: match.path,
      source: match.source,
      platform,
      arch,
    };
  }

  return unavailable(`Bundled semantic helper was not found for ${platform}-${arch} (${executable}).`);
}

View File

@@ -0,0 +1,10 @@
<Project Sdk="Microsoft.NET.Sdk">
  <!-- Console helper executable (CloudCLISemantics.exe) targeting Windows. -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0-windows</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <!-- System.Windows.Forms is used for screen metrics. -->
    <UseWindowsForms>true</UseWindowsForms>
    <!-- The helper uses System.Windows.Automation (AutomationElement); enabling WPF
         pulls in the UI Automation client reference. TODO confirm against the SDK in use. -->
    <UseWPF>true</UseWPF>
    <AssemblyName>CloudCLISemantics</AssemblyName>
  </PropertyGroup>
</Project>

View File

@@ -0,0 +1,519 @@
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;
using System.Runtime.InteropServices;
using System.Text.Json;
using System.Windows.Automation;
static class Program
{
private static readonly Dictionary<string, List<ElementRecord>> StateElements = new();
private static readonly Dictionary<string, Dictionary<string, AutomationElement>> StateAutomationElements = new();
/// <summary>
/// Request loop of the helper: reads one JSON request per line from standard
/// input until it is closed, dispatches on the "method" property and writes one
/// JSON response line per request carrying either "result" or "error".
/// </summary>
public static void Main()
{
string? line;
while ((line = Console.ReadLine()) != null)
{
// Outer scope: failures while parsing the line itself; no id is known yet.
try
{
using var doc = JsonDocument.Parse(line);
var root = doc.RootElement;
var id = root.TryGetProperty("id", out var idValue) ? idValue.Clone() : default;
var method = root.TryGetProperty("method", out var methodValue) ? methodValue.GetString() ?? "" : "";
// "params" is used only when it is a JSON object; otherwise an empty object is substituted.
var parameters = root.TryGetProperty("params", out var paramsValue) && paramsValue.ValueKind == JsonValueKind.Object
? paramsValue.Clone()
: JsonDocument.Parse("{}").RootElement.Clone();
// Inner scope: failures of the handler are reported with the request's id.
try
{
object result = method switch
{
"list_apps" => ListApps(),
"get_app_state" => GetAppState(parameters),
"click_element" => ClickElement(parameters),
"perform_secondary_action" => PerformSecondaryAction(parameters),
"set_value" => SetValue(parameters),
"type_text" => TypeText(parameters),
"press_key" => PressKey(parameters),
"scroll_element" => ScrollElement(parameters),
"drag" => Drag(parameters),
_ => throw new InvalidOperationException($"Method is not implemented yet: {method}")
};
Write(new Dictionary<string, object?> { ["id"] = JsonValue(id), ["result"] = result });
}
catch (Exception ex)
{
Write(new Dictionary<string, object?> { ["id"] = JsonValue(id), ["error"] = ex.Message });
}
}
catch (Exception ex)
{
// The request could not be parsed, so the response carries a null id.
Write(new Dictionary<string, object?> { ["id"] = null, ["error"] = $"Invalid JSON request: {ex.Message}" });
}
}
}
/// <summary>
/// Converts a scalar JSON element to a boxed .NET value so it can be echoed
/// back in a response. Strings and booleans map directly; every other kind,
/// including an undefined element, maps to null.
/// </summary>
/// <remarks>
/// For numbers the conditional expression has type double, so integral values
/// are widened to double before boxing.
/// </remarks>
private static object? JsonValue(JsonElement element)
{
    switch (element.ValueKind)
    {
        case JsonValueKind.String:
            return element.GetString();
        case JsonValueKind.Number:
            return element.TryGetInt64(out var number) ? number : element.GetDouble();
        case JsonValueKind.True:
            return true;
        case JsonValueKind.False:
            return false;
        default:
            return null;
    }
}
/// <summary>Serializes <paramref name="value"/> as one JSON line on standard output and flushes it.</summary>
private static void Write(object value)
{
    var json = JsonSerializer.Serialize(value);
    Console.WriteLine(json);
    Console.Out.Flush();
}
/// <summary>
/// Lists every process that owns a main window, ordered by process name.
/// Each entry carries "id", "name", "processName", "pid", "running" and "windowTitle".
/// </summary>
private static List<Dictionary<string, object?>> ListApps()
{
    var windowed = Process.GetProcesses()
        .Where(process => process.MainWindowHandle != IntPtr.Zero)
        .OrderBy(process => process.ProcessName);

    var apps = new List<Dictionary<string, object?>>();
    foreach (var process in windowed)
    {
        var entry = new Dictionary<string, object?>
        {
            ["id"] = process.Id.ToString(),
            ["name"] = process.ProcessName,
            ["processName"] = process.ProcessName,
            ["pid"] = process.Id,
            ["running"] = true,
            ["windowTitle"] = process.MainWindowTitle
        };
        apps.Add(entry);
    }
    return apps;
}
/// <summary>
/// Finds a process with a main window that matches <paramref name="query"/>.
/// Matching ignores case and is tried in this order: exact process name,
/// exact main window title, main window title containing the query.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The query is empty or whitespace, or no windowed process matches it.
/// </exception>
private static Process ResolveProcess(string query)
{
    var normalized = query.Trim();
    if (string.IsNullOrWhiteSpace(normalized))
    {
        throw new InvalidOperationException("app is required.");
    }

    var windowed = Process.GetProcesses()
        .Where(process => process.MainWindowHandle != IntPtr.Zero)
        .ToList();

    // Matchers in priority order; the first one that yields a process wins.
    var matchers = new Func<Process, bool>[]
    {
        process => process.ProcessName.Equals(normalized, StringComparison.OrdinalIgnoreCase),
        process => process.MainWindowTitle.Equals(normalized, StringComparison.OrdinalIgnoreCase),
        process => process.MainWindowTitle.Contains(normalized, StringComparison.OrdinalIgnoreCase),
    };
    foreach (var matcher in matchers)
    {
        var match = windowed.FirstOrDefault(matcher);
        if (match != null)
        {
            return match;
        }
    }

    throw new InvalidOperationException($"App is not running: {query}");
}
/// <summary>Maximum number of state snapshots kept in the element caches.</summary>
private const int MaxCachedStates = 32;

/// <summary>Identifiers of the cached snapshots in creation order, oldest first.</summary>
private static readonly Queue<string> CachedStateOrder = new();

/// <summary>
/// Captures the current state of the application named by the "app" parameter.
/// Walks the UI Automation tree of its main window (depth 5, at most 300
/// elements), caches the snapshot under a fresh state id and returns that id
/// together with a screenshot, the primary screen size, the window title and
/// bounds, and the element list.
/// </summary>
/// <remarks>
/// Only the most recent <see cref="MaxCachedStates"/> snapshots are retained,
/// so the caches do not grow by one snapshot per request for the lifetime of
/// the helper process.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// The application cannot be resolved or has no UI Automation root window.
/// </exception>
private static Dictionary<string, object?> GetAppState(JsonElement parameters)
{
    var appQuery = ReadString(parameters, "app");
    var process = ResolveProcess(appQuery);
    var root = AutomationElement.FromHandle(process.MainWindowHandle)
        ?? throw new InvalidOperationException("No UI Automation root window is available.");
    var records = new List<ElementRecord>();
    var automationElements = new Dictionary<string, AutomationElement>();
    Walk(root, records, automationElements, 0, 5, 300);

    var stateId = $"state_{Guid.NewGuid()}";
    StateElements[stateId] = records;
    StateAutomationElements[stateId] = automationElements;
    CachedStateOrder.Enqueue(stateId);
    // Evict the oldest snapshots once the cap is exceeded.
    while (CachedStateOrder.Count > MaxCachedStates)
    {
        var evicted = CachedStateOrder.Dequeue();
        StateElements.Remove(evicted);
        StateAutomationElements.Remove(evicted);
    }

    var elements = records.Select(record => record.ToDictionary()).ToList();
    var bounds = root.Current.BoundingRectangle;
    return new Dictionary<string, object?>
    {
        ["stateId"] = stateId,
        ["app"] = process.ProcessName,
        ["platform"] = "win32",
        ["screenshotDataUrl"] = CaptureScreen(),
        ["displaySize"] = new Dictionary<string, object?>
        {
            ["width"] = (int)System.Windows.Forms.Screen.PrimaryScreen!.Bounds.Width,
            ["height"] = (int)System.Windows.Forms.Screen.PrimaryScreen!.Bounds.Height
        },
        ["window"] = new Dictionary<string, object?>
        {
            ["title"] = process.MainWindowTitle,
            ["bounds"] = BoundsDictionary(bounds)
        },
        ["elements"] = elements,
        ["accessibilityTree"] = elements,
        ["treeText"] = string.Join("\n", elements.Select(element => $"{element["index"]} {element["role"]} {element.GetValueOrDefault("title")}"))
    };
}
/// <summary>
/// Clicks an element or a screen point. A plain single left click first tries the
/// element's UI Automation Invoke pattern; every other case (or a failed invoke) falls
/// back to a synthesized mouse click at x/y or at the element's centre.
/// </summary>
/// <exception cref="InvalidOperationException">No click target could be resolved.</exception>
private static Dictionary<string, object?> ClickElement(JsonElement parameters)
{
    var button = ReadString(parameters, "mouse_button");
    var isPrimaryButton = button is "" or "left";

    // click_count is deliberately read lazily so it is only consulted for the primary button.
    if (isPrimaryButton && ReadInt(parameters, "click_count", 1) == 1)
    {
        var target = AutomationElementFor(parameters);
        var invoked = target != null && TryInvoke(target);
        if (invoked)
        {
            return GetAppState(parameters);
        }
    }

    var location = PointFor(parameters)
        ?? throw new InvalidOperationException("click_element requires x/y or stateId + element_index.");
    SendMouseClick(location.X, location.Y, ReadString(parameters, "mouse_button"), ReadInt(parameters, "click_count", 1));
    return GetAppState(parameters);
}
/// <summary>
/// Performs a single right click at the requested point (x/y or the centre of the
/// referenced element) and returns the refreshed app state.
/// </summary>
/// <exception cref="InvalidOperationException">No target point could be resolved.</exception>
private static Dictionary<string, object?> PerformSecondaryAction(JsonElement parameters)
{
    var location = PointFor(parameters)
        ?? throw new InvalidOperationException("perform_secondary_action requires x/y or stateId + element_index.");
    SendMouseClick(location.X, location.Y, "right", 1);
    return GetAppState(parameters);
}
/// <summary>
/// Sets the text of a target element. Prefers the UI Automation Value pattern; when the
/// element does not expose it, focuses the target (SetFocus and/or a left click at the
/// resolved point) and types the value after sending Ctrl+A.
/// </summary>
/// <exception cref="InvalidOperationException">
/// No target was supplied, or the target could not be focused.
/// </exception>
private static Dictionary<string, object?> SetValue(JsonElement parameters)
{
var value = ReadString(parameters, "value");
var element = AutomationElementFor(parameters);
var focused = false;
if (element != null)
{
// Fast path: set the value directly, no keyboard input involved.
if (element.TryGetCurrentPattern(ValuePattern.Pattern, out var valuePattern))
{
((ValuePattern)valuePattern).SetValue(value);
return GetAppState(parameters);
}
try
{
element.SetFocus();
focused = true;
}
catch
{
// Fall through to coordinate focus below.
}
}
// A resolvable point is clicked even when SetFocus already succeeded.
var point = PointFor(parameters);
if (point != null)
{
SendMouseClick(point.Value.X, point.Value.Y, "left", 1);
focused = true;
}
else if (!focused && element == null)
{
throw new InvalidOperationException("set_value requires x/y or stateId + element_index.");
}
else if (!focused)
{
throw new InvalidOperationException("set_value could not focus the requested element.");
}
// "^a" is SendKeys notation for Ctrl+A: select the existing content so typing replaces it.
System.Windows.Forms.SendKeys.SendWait("^a");
System.Windows.Forms.SendKeys.SendWait(EscapeSendKeys(value));
return GetAppState(parameters);
}
/// <summary>
/// Types the "text" parameter literally into whatever currently has keyboard focus and
/// returns the refreshed app state.
/// </summary>
private static Dictionary<string, object?> TypeText(JsonElement parameters)
{
    var keystrokes = EscapeSendKeys(ReadString(parameters, "text"));
    System.Windows.Forms.SendKeys.SendWait(keystrokes);
    return GetAppState(parameters);
}
/// <summary>
/// Sends the key or key chord named by the "key" parameter (for example "enter" or
/// "ctrl+c") to the focused window and returns the refreshed app state.
/// </summary>
private static Dictionary<string, object?> PressKey(JsonElement parameters)
{
    var chord = ToSendKeysChord(ReadString(parameters, "key"));
    System.Windows.Forms.SendKeys.SendWait(chord);
    return GetAppState(parameters);
}
/// <summary>
/// Scrolls the target in the requested "direction" ("up", "down", "left" or "right").
/// Uses the element's UI Automation Scroll pattern when available; otherwise moves the
/// cursor to the target point and synthesizes mouse-wheel input ("pages" wheel notches,
/// at least one).
/// </summary>
/// <exception cref="InvalidOperationException">
/// The element has no Scroll pattern and no target point could be resolved.
/// </exception>
private static Dictionary<string, object?> ScrollElement(JsonElement parameters)
{
    const uint MouseEventWheel = 0x0800;   // MOUSEEVENTF_WHEEL
    const uint MouseEventHWheel = 0x1000;  // MOUSEEVENTF_HWHEEL
    const int WheelDelta = 120;            // one wheel notch

    var element = AutomationElementFor(parameters);
    var direction = ReadString(parameters, "direction");
    var pages = ReadDouble(parameters, "pages", 1);

    if (element != null && element.TryGetCurrentPattern(ScrollPattern.Pattern, out var scrollPatternValue))
    {
        var scrollPattern = (ScrollPattern)scrollPatternValue;
        var vertical = direction == "up" ? ScrollAmount.LargeDecrement : direction == "down" ? ScrollAmount.LargeIncrement : ScrollAmount.NoAmount;
        var horizontal = direction == "left" ? ScrollAmount.LargeDecrement : direction == "right" ? ScrollAmount.LargeIncrement : ScrollAmount.NoAmount;
        scrollPattern.Scroll(horizontal, vertical);
        return GetAppState(parameters);
    }

    var point = PointFor(parameters);
    if (point == null)
    {
        throw new InvalidOperationException("scroll_element requires x/y or stateId + element_index.");
    }

    SetCursorPos(point.Value.X, point.Value.Y);
    var magnitude = (int)Math.Round(Math.Max(1, pages) * WheelDelta);
    var isHorizontal = direction == "left" || direction == "right";
    // Win32 sign convention: a positive vertical delta rotates the wheel away from the
    // user (content scrolls UP); a positive horizontal delta tilts the wheel to the RIGHT.
    // Anything that is not up/left/right is treated as "down".
    var delta = direction == "up" || direction == "right" ? magnitude : -magnitude;
    mouse_event(isHorizontal ? MouseEventHWheel : MouseEventWheel, 0, 0, unchecked((uint)delta), UIntPtr.Zero);
    return GetAppState(parameters);
}
/// <summary>
/// Drags with the left mouse button from (from_x, from_y) to (to_x, to_y): press at the
/// start, pause, move, pause, release. Returns the refreshed app state.
/// </summary>
/// <exception cref="InvalidOperationException">Any of the four coordinates is missing.</exception>
private static Dictionary<string, object?> Drag(JsonElement parameters)
{
    const uint LeftDown = 0x0002;
    const uint LeftUp = 0x0004;
    const int PauseMs = 80;

    var startX = ReadDouble(parameters, "from_x", double.NaN);
    var startY = ReadDouble(parameters, "from_y", double.NaN);
    var endX = ReadDouble(parameters, "to_x", double.NaN);
    var endY = ReadDouble(parameters, "to_y", double.NaN);

    // NaN is the "not supplied" sentinel used as the default above.
    var coordinates = new[] { startX, startY, endX, endY };
    if (coordinates.Any(double.IsNaN))
    {
        throw new InvalidOperationException("drag requires from_x/from_y/to_x/to_y.");
    }

    SetCursorPos((int)Math.Round(startX), (int)Math.Round(startY));
    mouse_event(LeftDown, 0, 0, 0, UIntPtr.Zero);
    Thread.Sleep(PauseMs);
    SetCursorPos((int)Math.Round(endX), (int)Math.Round(endY));
    Thread.Sleep(PauseMs);
    mouse_event(LeftUp, 0, 0, 0, UIntPtr.Zero);
    return GetAppState(parameters);
}
/// <summary>
/// Depth-first, pre-order walk of the automation tree rooted at <paramref name="element"/>.
/// Each visited element gets the next 1-based index (as a string), is recorded in
/// <paramref name="records"/> and is remembered in <paramref name="automationElements"/>
/// under that index. Stops descending past <paramref name="maxDepth"/> and stops entirely
/// once <paramref name="limit"/> records exist.
/// </summary>
private static void Walk(AutomationElement element, List<ElementRecord> records, Dictionary<string, AutomationElement> automationElements, int depth, int maxDepth, int limit)
{
    var tooDeep = depth > maxDepth;
    var full = records.Count >= limit;
    if (tooDeep || full)
    {
        return;
    }

    var slot = (records.Count + 1).ToString();
    records.Add(ElementRecord.From(element, slot));
    automationElements[slot] = element;

    var nextDepth = depth + 1;
    foreach (AutomationElement descendant in element.FindAll(TreeScope.Children, Condition.TrueCondition))
    {
        Walk(descendant, records, automationElements, nextDepth, maxDepth, limit);
        if (records.Count >= limit)
        {
            break;
        }
    }
}
/// <summary>
/// Reads a string property from a JSON object.
/// </summary>
/// <returns>
/// The property value, or an empty string when <paramref name="element"/> is not a JSON
/// object, the property is missing, or its value is not a JSON string.
/// </returns>
private static string ReadString(JsonElement element, string property)
{
    // TryGetProperty throws on anything that is not a JSON object (e.g. absent params).
    if (element.ValueKind != JsonValueKind.Object)
    {
        return "";
    }
    return element.TryGetProperty(property, out var value) && value.ValueKind == JsonValueKind.String
        ? value.GetString() ?? ""
        : "";
}
/// <summary>
/// Reads a 32-bit integer property from a JSON object.
/// </summary>
/// <returns>
/// The property value, or <paramref name="defaultValue"/> when <paramref name="element"/>
/// is not a JSON object, the property is missing, is not a JSON number, or does not fit
/// an <c>int</c>.
/// </returns>
private static int ReadInt(JsonElement element, string property, int defaultValue)
{
    if (element.ValueKind != JsonValueKind.Object || !element.TryGetProperty(property, out var value))
    {
        return defaultValue;
    }
    // TryGetInt32 throws InvalidOperationException for non-Number kinds (null, string, ...),
    // so the kind must be checked first for the default to be reachable.
    return value.ValueKind == JsonValueKind.Number && value.TryGetInt32(out var number)
        ? number
        : defaultValue;
}
/// <summary>
/// Reads a floating-point property from a JSON object.
/// </summary>
/// <returns>
/// The property value, or <paramref name="defaultValue"/> when <paramref name="element"/>
/// is not a JSON object, the property is missing, or it is not a JSON number.
/// </returns>
private static double ReadDouble(JsonElement element, string property, double defaultValue)
{
    if (element.ValueKind != JsonValueKind.Object || !element.TryGetProperty(property, out var value))
    {
        return defaultValue;
    }
    // TryGetDouble throws InvalidOperationException for non-Number kinds (null, string, ...),
    // so the kind must be checked first for the default to be reachable.
    return value.ValueKind == JsonValueKind.Number && value.TryGetDouble(out var number)
        ? number
        : defaultValue;
}
/// <summary>
/// Resolves the live automation element referenced by "stateId" + "element_index".
/// </summary>
/// <returns>
/// The element, or <c>null</c> when either parameter is blank or the reference is unknown.
/// </returns>
private static AutomationElement? AutomationElementFor(JsonElement parameters)
{
    var snapshotId = ReadString(parameters, "stateId");
    var index = ReadString(parameters, "element_index");
    if (string.IsNullOrWhiteSpace(snapshotId) || string.IsNullOrWhiteSpace(index))
    {
        return null;
    }
    if (!StateAutomationElements.TryGetValue(snapshotId, out var snapshot))
    {
        return null;
    }
    return snapshot.TryGetValue(index, out var match) ? match : null;
}
/// <summary>
/// Resolves the screen point an action should target.
/// Explicit numeric "x"/"y" parameters win; otherwise the centre of the element referenced
/// by "stateId" + "element_index" is used.
/// </summary>
/// <returns>
/// The rounded point, or <c>null</c> when no usable target is available (missing or
/// non-numeric coordinates and no resolvable element, or an element without real bounds).
/// </returns>
private static System.Drawing.Point? PointFor(JsonElement parameters)
{
    // Check the kinds first: TryGetProperty throws on non-objects and TryGetDouble throws
    // on non-numbers (e.g. "x": null), which would otherwise pre-empt the element fallback.
    if (parameters.ValueKind == JsonValueKind.Object
        && parameters.TryGetProperty("x", out var xValue) && parameters.TryGetProperty("y", out var yValue)
        && xValue.ValueKind == JsonValueKind.Number && yValue.ValueKind == JsonValueKind.Number
        && xValue.TryGetDouble(out var x) && yValue.TryGetDouble(out var y))
    {
        return new System.Drawing.Point((int)Math.Round(x), (int)Math.Round(y));
    }

    var stateId = ReadString(parameters, "stateId");
    var elementIndex = ReadString(parameters, "element_index");
    if (string.IsNullOrWhiteSpace(stateId) || string.IsNullOrWhiteSpace(elementIndex)) return null;
    if (!StateElements.TryGetValue(stateId, out var elements)) return null;

    var element = elements.FirstOrDefault(item => item.Index == elementIndex);
    // Elements without an on-screen rectangle report Rect.Empty (infinite origin, negative
    // infinite size); its "centre" is NaN, which would cast to a garbage coordinate.
    if (element?.Bounds is not { } bounds || bounds.IsEmpty) return null;

    var centerX = bounds.Left + bounds.Width / 2;
    var centerY = bounds.Top + bounds.Height / 2;
    if (double.IsNaN(centerX) || double.IsInfinity(centerX) || double.IsNaN(centerY) || double.IsInfinity(centerY))
    {
        return null;
    }
    return new System.Drawing.Point((int)Math.Round(centerX), (int)Math.Round(centerY));
}
/// <summary>
/// Takes a screenshot of the primary screen and returns it as a PNG data URL
/// ("data:image/png;base64,...").
/// </summary>
private static string CaptureScreen()
{
    var screenArea = System.Windows.Forms.Screen.PrimaryScreen!.Bounds;
    using (var image = new Bitmap(screenArea.Width, screenArea.Height))
    using (var canvas = Graphics.FromImage(image))
    {
        canvas.CopyFromScreen(screenArea.Left, screenArea.Top, 0, 0, screenArea.Size);
        using (var buffer = new MemoryStream())
        {
            image.Save(buffer, ImageFormat.Png);
            var encoded = Convert.ToBase64String(buffer.ToArray());
            return $"data:image/png;base64,{encoded}";
        }
    }
}
/// <summary>
/// Converts a rectangle into the { x, y, width, height } dictionary used in responses.
/// </summary>
private static Dictionary<string, object?> BoundsDictionary(System.Windows.Rect rect)
{
    var bounds = new Dictionary<string, object?>();
    bounds["x"] = rect.X;
    bounds["y"] = rect.Y;
    bounds["width"] = rect.Width;
    bounds["height"] = rect.Height;
    return bounds;
}
// Win32 SetCursorPos: moves the mouse cursor to the given screen coordinates.
[DllImport("user32.dll")]
private static extern bool SetCursorPos(int x, int y);
// Win32 mouse_event: synthesizes mouse input at the current cursor position.
// dwFlags values used in this file: 0x0002/0x0004 left down/up, 0x0008/0x0010 right
// down/up, 0x0020/0x0040 middle down/up, 0x0800 wheel (dwData carries the wheel delta).
[DllImport("user32.dll")]
private static extern void mouse_event(uint dwFlags, uint dx, uint dy, uint dwData, UIntPtr dwExtraInfo);
/// <summary>
/// Moves the cursor to (x, y) and clicks there with the given button.
/// </summary>
/// <param name="button">"right" or "middle"; any other value means the left button.</param>
/// <param name="clickCount">Number of clicks; values below 1 are treated as 1.</param>
private static void SendMouseClick(int x, int y, string button, int clickCount)
{
    uint pressFlag;
    uint releaseFlag;
    if (button == "right")
    {
        pressFlag = 0x0008u;
        releaseFlag = 0x0010u;
    }
    else if (button == "middle")
    {
        pressFlag = 0x0020u;
        releaseFlag = 0x0040u;
    }
    else
    {
        pressFlag = 0x0002u;
        releaseFlag = 0x0004u;
    }

    SetCursorPos(x, y);
    var remaining = Math.Max(1, clickCount);
    while (remaining > 0)
    {
        mouse_event(pressFlag, 0, 0, 0, UIntPtr.Zero);
        mouse_event(releaseFlag, 0, 0, 0, UIntPtr.Zero);
        // Short pause after every click (including the last one).
        Thread.Sleep(80);
        remaining--;
    }
}
/// <summary>
/// Best-effort activation of an element through its UI Automation Invoke pattern.
/// </summary>
/// <returns>
/// <c>true</c> when the element was invoked; <c>false</c> when it has no Invoke pattern
/// or the attempt threw.
/// </returns>
private static bool TryInvoke(AutomationElement element)
{
    try
    {
        var supported = element.TryGetCurrentPattern(InvokePattern.Pattern, out var candidate);
        if (supported)
        {
            ((InvokePattern)candidate).Invoke();
        }
        return supported;
    }
    catch
    {
        // Any failure means the caller should fall back to a synthesized click.
        return false;
    }
}
/// <summary>
/// Escapes text so SendKeys types it literally: each of the special characters
/// <c>+ ^ % ~ ( ) { } [ ]</c> is wrapped in braces, everything else is passed through.
/// </summary>
private static string EscapeSendKeys(string value)
{
    const string special = "+^%~(){}[]";
    // Escape in a single pass over the input. Chaining Replace calls re-escaped the
    // closing brace inserted while escaping "{", turning "{" into "{{{}}".
    return string.Concat(value.Select(ch => special.IndexOf(ch) >= 0 ? "{" + ch + "}" : ch.ToString()));
}
/// <summary>
/// Translates a key description such as "enter", "ctrl+c" or "ctrl+shift+tab" into
/// SendKeys notation. Modifiers map to ^ (ctrl/control, and also cmd/win/windows),
/// % (alt) and + (shift); unknown modifiers are ignored. A trailing "+" names the plus
/// key itself, so "+" and "ctrl++" are valid.
/// </summary>
private static string ToSendKeysChord(string key)
{
    var normalized = key.Trim();
    if (!normalized.Contains('+'))
    {
        return SendKeyName(normalized);
    }

    var parts = normalized.Split('+', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    // When the description ends with '+', the final key IS the plus key and every split
    // part is a modifier. Without this, "+" / "ctrl++" lost their key and produced "{}".
    var endsWithPlusKey = normalized.EndsWith('+');
    var modifierParts = endsWithPlusKey ? parts : parts.Take(parts.Length - 1);
    var last = endsWithPlusKey ? "+" : parts.LastOrDefault() ?? "";

    var modifiers = "";
    foreach (var part in modifierParts)
    {
        modifiers += part.ToLowerInvariant() switch
        {
            "ctrl" or "control" => "^",
            "alt" => "%",
            "shift" => "+",
            "cmd" or "win" or "windows" => "^",
            _ => ""
        };
    }
    return modifiers + SendKeyName(last);
}
/// <summary>
/// Maps a single key name to SendKeys notation. Well-known names (case-insensitive) map
/// to their SendKeys keyword; a single character is sent literally (escaped when
/// special); any other name is upper-cased and wrapped in braces, e.g. "f5" becomes "{F5}".
/// </summary>
private static string SendKeyName(string key)
{
    return key.ToLowerInvariant() switch
    {
        "return" or "enter" => "{ENTER}",
        "escape" or "esc" => "{ESC}",
        "tab" => "{TAB}",
        "backspace" => "{BACKSPACE}",
        "delete" or "del" => "{DELETE}",
        "left" or "arrowleft" => "{LEFT}",
        "right" or "arrowright" => "{RIGHT}",
        "up" or "arrowup" => "{UP}",
        "down" or "arrowdown" => "{DOWN}",
        // SendKeys has no PAGEUP/PAGEDOWN/PRINTSCREEN keywords; the generic fallback below
        // would build an invalid keyword for these common names.
        "pageup" or "pgup" => "{PGUP}",
        "pagedown" or "pgdn" => "{PGDN}",
        "printscreen" or "prtsc" => "{PRTSC}",
        "space" => " ",
        _ => key.Length == 1 ? EscapeSendKeys(key) : $"{{{key.ToUpperInvariant()}}}"
    };
}
/// <summary>
/// Serializable snapshot of one UI Automation element.
/// <c>Index</c> identifies the element within its state, <c>Role</c> is the control type
/// name without the "ControlType." prefix, <c>Actions</c> lists the programmatic names of
/// the supported patterns, and <c>Bounds</c> is <c>null</c> when the element has no real
/// on-screen rectangle.
/// </summary>
private sealed record ElementRecord(
    string Index,
    string Role,
    string? Title,
    string? Value,
    System.Windows.Rect? Bounds,
    List<string> Actions)
{
    /// <summary>Builds a record from the live element's current properties.</summary>
    public static ElementRecord From(AutomationElement element, string index)
    {
        var patterns = element.GetSupportedPatterns().Select(pattern => pattern.ProgrammaticName).ToList();
        return new ElementRecord(
            index,
            element.Current.ControlType.ProgrammaticName.Replace("ControlType.", ""),
            element.Current.Name,
            TryValue(element),
            UsableBounds(element.Current.BoundingRectangle),
            patterns
        );
    }

    /// <summary>
    /// Converts the record to the response dictionary. "index", "role" and "actions" are
    /// always present; "title", "value" and "bounds" only when they carry data.
    /// </summary>
    public Dictionary<string, object?> ToDictionary()
    {
        var output = new Dictionary<string, object?>
        {
            ["index"] = Index,
            ["role"] = Role,
            ["actions"] = Actions
        };
        if (!string.IsNullOrEmpty(Title)) output["title"] = Title;
        if (!string.IsNullOrEmpty(Value)) output["value"] = Value;
        // Re-check here as well so a record constructed directly with an empty rectangle
        // still serializes cleanly.
        if (Bounds != null && UsableBounds(Bounds.Value) != null) output["bounds"] = BoundsDictionary(Bounds.Value);
        return output;
    }

    /// <summary>
    /// Returns the rectangle when it is a real, finite area; otherwise <c>null</c>.
    /// Rect.Empty (reported for elements with no on-screen rectangle) has infinite
    /// coordinates, which System.Text.Json refuses to write and which have no usable centre.
    /// </summary>
    private static System.Windows.Rect? UsableBounds(System.Windows.Rect rect)
    {
        if (rect.IsEmpty) return null;
        var values = new[] { rect.X, rect.Y, rect.Width, rect.Height };
        if (values.Any(component => double.IsNaN(component) || double.IsInfinity(component))) return null;
        return rect;
    }

    /// <summary>Reads the element's Value pattern text, or <c>null</c> when unavailable.</summary>
    private static string? TryValue(AutomationElement element)
    {
        try
        {
            if (element.TryGetCurrentPattern(ValuePattern.Pattern, out var pattern))
            {
                return ((ValuePattern)pattern).Current.Value;
            }
        }
        catch
        {
            // Reading the value is best-effort; an unreadable value is reported as absent.
            return null;
        }
        return null;
    }
}
}

View File

@@ -0,0 +1,83 @@
import { randomUUID } from 'node:crypto';
import type { SemanticAppState, SemanticElement } from '@/modules/computer-use/semantics/semantic-types.js';
// How long a stored state stays retrievable, in milliseconds. Taken from
// CLOUDCLI_COMPUTER_SEMANTIC_STATE_TTL_MS when set, otherwise 10 minutes. The parsed
// value may be NaN or non-positive for a malformed override; expire() falls back to
// 10 minutes in that case.
const DEFAULT_STATE_TTL_MS = Number.parseInt(process.env.CLOUDCLI_COMPUTER_SEMANTIC_STATE_TTL_MS || String(10 * 60 * 1000), 10);
// Internal bookkeeping record for one saved state.
type StoredState = {
// Session that saved the state; lookups by state id are rejected for other sessions.
sessionId: string;
// Normalized (trimmed, lower-cased) app name the state belongs to.
appKey: string;
// The saved state, always carrying a state id.
state: SemanticAppState;
// Date.now() timestamp of the save, compared against the TTL by expire().
updatedAt: number;
};
/**
 * Canonical form of an app name for use as a lookup key: surrounding whitespace removed,
 * then lower-cased.
 */
function normalizeAppKey(app: string): string {
  const trimmed = app.trim();
  return trimmed.toLowerCase();
}
/**
 * In-memory store of semantic app states, keyed by state id, with a per-(session, app)
 * pointer to the most recently saved state. Entries older than the configured TTL are
 * evicted lazily whenever the store is read or written.
 */
export class SemanticSessionStore {
  private states = new Map<string, StoredState>();
  private latestBySessionApp = new Map<string, string>();

  /** Generates a new unique state id of the form `state_<uuid>`. */
  createStateId(): string {
    return `state_${randomUUID()}`;
  }

  /**
   * Stores `state` for `sessionId` and marks it as the latest state of its app.
   * A state id is generated when the incoming state has none.
   *
   * @returns the stored state (a copy carrying the final state id).
   */
  save(sessionId: string, state: SemanticAppState): SemanticAppState {
    // Evict on write as well, so a session that only ever saves cannot grow the map
    // without bound.
    this.expire();
    const appKey = normalizeAppKey(state.app);
    const nextState = {
      ...state,
      stateId: state.stateId || this.createStateId(),
    };
    this.states.set(nextState.stateId, {
      sessionId,
      appKey,
      state: nextState,
      updatedAt: Date.now(),
    });
    this.latestBySessionApp.set(this.latestKey(sessionId, appKey), nextState.stateId);
    return nextState;
  }

  /**
   * Looks up a state for `sessionId`: the one with `stateId` when given, otherwise the
   * latest state saved for `app`.
   *
   * @returns the state, or `null` when it is unknown, expired, or owned by another session.
   */
  getState(sessionId: string, app: string, stateId?: string): SemanticAppState | null {
    this.expire();
    const resolvedId = stateId || this.latestBySessionApp.get(this.latestKey(sessionId, normalizeAppKey(app)));
    if (!resolvedId) {
      return null;
    }
    const entry = this.states.get(resolvedId);
    // The ownership check also covers the "latest" path: a state id re-saved by another
    // session must not be served through a stale pointer.
    return entry && entry.sessionId === sessionId ? entry.state : null;
  }

  /**
   * Finds one element of a state by its index.
   *
   * @returns the element, or `null` when the state or the element cannot be found.
   */
  getElement(sessionId: string, app: string, elementIndex: string, stateId?: string): SemanticElement | null {
    const state = this.getState(sessionId, app, stateId);
    return state?.elements.find((element) => element.index === elementIndex) || null;
  }

  /** Removes every state saved by `sessionId`, together with its latest-state pointers. */
  clearSession(sessionId: string): void {
    for (const [stateId, entry] of this.states.entries()) {
      if (entry.sessionId === sessionId) {
        this.remove(stateId, entry);
      }
    }
  }

  /**
   * Evicts every state older than the TTL, measured against `now`.
   * Falls back to 10 minutes when the configured TTL is not a positive finite number.
   */
  expire(now = Date.now()): void {
    const ttl = Number.isFinite(DEFAULT_STATE_TTL_MS) && DEFAULT_STATE_TTL_MS > 0
      ? DEFAULT_STATE_TTL_MS
      : 10 * 60 * 1000;
    for (const [stateId, entry] of this.states.entries()) {
      if (now - entry.updatedAt > ttl) {
        this.remove(stateId, entry);
      }
    }
  }

  /**
   * Deletes one state. The latest-state pointer is dropped only when it references this
   * very state: previously, expiring an OLD state also deleted the pointer to a newer,
   * still-valid state of the same app, making it unreachable without an explicit id.
   */
  private remove(stateId: string, entry: StoredState): void {
    this.states.delete(stateId);
    const key = this.latestKey(entry.sessionId, entry.appKey);
    if (this.latestBySessionApp.get(key) === stateId) {
      this.latestBySessionApp.delete(key);
    }
  }

  private latestKey(sessionId: string, appKey: string): string {
    return `${sessionId}:${appKey}`;
  }
}
// Module-level store instance exported for importers of this module.
export const semanticSessionStore = new SemanticSessionStore();

View File

@@ -0,0 +1,17 @@
/** Maps each semantic MCP tool name to the name of the operation it triggers. */
export const semanticMcpToolMap: Record<string, string> = {
  computer_app_drag: 'drag',
  computer_click_element: 'click',
  computer_get_app_state: 'get_app_state',
  computer_list_apps: 'list_apps',
  computer_perform_secondary_action: 'perform_secondary_action',
  computer_press_key: 'press_key',
  computer_scroll_element: 'scroll',
  computer_set_value: 'set_value',
  computer_type_text: 'type_text',
};

/** Every operation name that appears as a value in the tool map. */
export const semanticOperationNames = new Set(Object.values(semanticMcpToolMap));

/**
 * Resolves the operation for an MCP tool name.
 *
 * @returns the operation name, or `null` when the tool is not a semantic tool.
 */
export function semanticOperationForMcpTool(toolName: string): string | null {
  // Own-property check: a bare index lookup on a plain object also finds inherited
  // members, so names like "constructor" or "toString" used to return a function
  // instead of null.
  if (!Object.prototype.hasOwnProperty.call(semanticMcpToolMap, toolName)) {
    return null;
  }
  return semanticMcpToolMap[toolName] || null;
}

View File

@@ -0,0 +1,58 @@
import type { DisplaySize, Point } from '@/modules/computer-use/computer-executor.js';
/** Rectangle given as an origin plus a size. NOTE(review): units/coordinate space are not stated here — presumably screen pixels; confirm. */
export type SemanticBounds = {
x: number;
y: number;
width: number;
height: number;
};
/** One entry of an app listing. */
export type SemanticApp = {
id?: string;
name: string;
// NOTE(review): presumably only populated on platforms that have bundle identifiers — confirm.
bundleIdentifier?: string;
processName?: string;
pid?: number;
running: boolean;
windowTitle?: string;
};
/** One UI element of an app state. */
export type SemanticElement = {
// Identifier of the element within its state; elements are looked up by this value.
index: string;
role: string;
title?: string;
value?: string;
description?: string;
enabled?: boolean;
focused?: boolean;
selected?: boolean;
bounds?: SemanticBounds;
actions?: string[];
settableValue?: boolean;
};
/** Snapshot of an app: its elements plus an optional screenshot and display size. */
export type SemanticAppState = {
stateId: string;
app: string;
platform: NodeJS.Platform;
screenshotDataUrl: string | null;
displaySize: DisplaySize | null;
elements: SemanticElement[];
accessibilityTree: SemanticElement[];
treeText?: string;
message?: string;
};
/** Input of a semantic tool call: arbitrary properties plus the optional common fields below. */
export type SemanticToolInput = Record<string, unknown> & {
sessionId?: string;
app?: string;
stateId?: string;
element_index?: string;
};
/** Result of a semantic tool call: either an app state or an app listing. */
export type SemanticToolResult = SemanticAppState | {
apps: SemanticApp[];
platform: NodeJS.Platform;
};
/** Alias of the executor's Point type. */
export type SemanticActionPoint = Point;

View File

@@ -8,13 +8,10 @@ type ComputerUseStatus = {
enabled: boolean;
runtime: 'cloud' | 'local';
available: boolean;
requiresDesktopBridge: boolean;
nutInstalled: boolean;
screenshotInstalled: boolean;
installInProgress: boolean;
sessionCount: number;
agentToolsEnabled: boolean;
mcpRecommended: boolean;
message: string;
};
@@ -114,12 +111,6 @@ export default function ComputerUsePanel({ isVisible }: ComputerUsePanelProps) {
}
}, [refresh]);
const createSession = () => runAction(async () => {
const response = await authenticatedFetch('/api/computer-use/sessions', { method: 'POST' });
const data = await readJson<{ data: { session: ComputerUseSession } }>(response);
setSelectedSessionId(data.data.session.id);
});
const captureScreenshot = () => runAction(async () => {
if (!selectedSession) return;
const response = await authenticatedFetch(`/api/computer-use/sessions/${selectedSession.id}/screenshot`, { method: 'POST' });
@@ -252,12 +243,12 @@ export default function ComputerUsePanel({ isVisible }: ComputerUsePanelProps) {
<div className="max-w-md px-6 text-center">
<MonitorCog className="mx-auto h-10 w-10 text-neutral-500" />
<div className="mt-3 text-sm font-medium text-neutral-100">
{selectedSession?.message || 'Start a Computer Use session to capture your desktop.'}
{selectedSession?.message || 'No active Computer Use session.'}
</div>
<p className="mt-2 text-xs leading-relaxed text-neutral-400">
{isCloud
? 'Cloud Computer Use requires a linked local CloudCLI Desktop Agent.'
: 'Install the desktop control runtime from this panel or enable Computer Use from Settings.'}
? 'Agents create sessions automatically. Keep the CloudCLI desktop app connected to approve control requests.'
: 'Agents create sessions automatically. Enable Computer Use and install the local runtime if needed.'}
</p>
</div>
)}
@@ -274,7 +265,9 @@ export default function ComputerUsePanel({ isVisible }: ComputerUsePanelProps) {
{status && <Badge variant="outline" className="text-[11px]">{status.runtime}</Badge>}
</div>
<p className="mt-0.5 text-xs text-muted-foreground">
Capture your desktop and let agents drive the mouse and keyboard only while you grant control.
{isCloud
? 'Monitor cloud agent desktop sessions and stop access when needed.'
: 'Monitor local desktop sessions and grant control only when an agent needs it.'}
</p>
</div>
<div className="flex flex-wrap items-center gap-2">
@@ -287,10 +280,6 @@ export default function ComputerUsePanel({ isVisible }: ComputerUsePanelProps) {
<RefreshCw className="h-4 w-4" />
Refresh
</Button>
<Button size="sm" onClick={createSession} disabled={isBusy || !status?.available}>
<MonitorCog className="h-4 w-4" />
New Session
</Button>
</div>
</div>
@@ -333,13 +322,20 @@ export default function ComputerUsePanel({ isVisible }: ComputerUsePanelProps) {
<ShieldCheck className="h-3.5 w-3.5" />
Safety
</div>
<p className="mt-1.5">
Agents can act on a session only while you have granted control. Use
<span className="font-medium text-foreground"> Grant Control </span>
to allow agent actions, and
<span className="font-medium text-foreground"> Stop </span>
to revoke instantly.
</p>
{isCloud ? (
<p className="mt-1.5">
Agents create sessions automatically through MCP. The CloudCLI desktop app asks for approval on this
computer, and <span className="font-medium text-foreground">Stop</span> ends the session and clears access.
</p>
) : (
<p className="mt-1.5">
Agents create sessions automatically through MCP but cannot act until you grant control here. Use
<span className="font-medium text-foreground"> Grant Control </span>
to allow agent actions, and
<span className="font-medium text-foreground"> Stop </span>
to revoke instantly.
</p>
)}
</div>
{sessions.map((session) => (
<button
@@ -373,7 +369,7 @@ export default function ComputerUsePanel({ isVisible }: ComputerUsePanelProps) {
))}
{sessions.length === 0 && (
<div className="rounded-lg border border-dashed border-border/70 px-3 py-8 text-center text-xs text-muted-foreground">
No Computer Use sessions yet.
Agents will create sessions automatically when they need desktop access.
</div>
)}
</div>
@@ -385,22 +381,22 @@ export default function ComputerUsePanel({ isVisible }: ComputerUsePanelProps) {
<Camera className="h-4 w-4" />
Screenshot
</Button>
{selectedSession?.agentAccessEnabled ? (
{!isCloud && selectedSession?.agentAccessEnabled ? (
<Button variant="outline" size="sm" onClick={revokeControl} disabled={isBusy || !selectedSession}>
<X className="h-4 w-4" />
Revoke Control
</Button>
) : (
) : !isCloud ? (
<Button
variant="outline"
size="sm"
onClick={grantControl}
disabled={isBusy || !selectedSession || selectedSession.status !== 'ready' || !status?.agentToolsEnabled}
disabled={isBusy || !selectedSession || selectedSession.status !== 'ready' || !status?.enabled}
>
<Bot className="h-4 w-4" />
Grant Control
</Button>
)}
) : null}
<Button variant="outline" size="sm" onClick={() => setIsFullscreen(true)} disabled={!selectedSession?.screenshotDataUrl}>
<Expand className="h-4 w-4" />
Full Screen
@@ -433,14 +429,16 @@ export default function ComputerUsePanel({ isVisible }: ComputerUsePanelProps) {
{selectedSession?.agentAccessEnabled && (
<span className="ml-auto inline-flex items-center gap-1 rounded border border-emerald-500/30 px-2 py-0.5 text-emerald-600 dark:text-emerald-300">
<Bot className="h-3.5 w-3.5" />
Agent control active
{isCloud ? 'Desktop-approved session' : 'Agent control active'}
</span>
)}
</div>
{renderSurface()}
</div>
<p className="mx-auto mt-2 max-w-6xl text-center text-xs text-muted-foreground">
Click the screenshot to click the real desktop. Focus the view and type to send keystrokes.
{selectedSession
? 'Click the screenshot to click the real desktop. Focus the view and type to send keystrokes.'
: 'Computer Use sessions appear here after an agent requests desktop access.'}
</p>
</div>
</main>

View File

@@ -121,9 +121,25 @@ function MainContent({
const loadComputerUseSettings = useCallback(async () => {
try {
const response = await authenticatedFetch('/api/computer-use/settings');
const data = await response.json();
setComputerUseEnabled(Boolean(response.ok && data?.success !== false && data?.data?.settings?.enabled));
const [settingsResponse, statusResponse] = await Promise.all([
authenticatedFetch('/api/computer-use/settings'),
authenticatedFetch('/api/computer-use/status'),
]);
const settingsData = await settingsResponse.json();
const statusData = await statusResponse.json();
const runtime = statusData?.data?.runtime;
const settingsEnabled = Boolean(
settingsResponse.ok &&
settingsData?.success !== false &&
settingsData?.data?.settings?.enabled
);
const cloudEnabled = Boolean(
statusResponse.ok &&
statusData?.success !== false &&
runtime === 'cloud' &&
statusData?.data?.enabled
);
setComputerUseEnabled(runtime === 'cloud' ? cloudEnabled : settingsEnabled);
} catch {
setComputerUseEnabled(false);
}

View File

@@ -10,17 +10,16 @@ import SettingsToggle from '../../SettingsToggle';
type ComputerUseSettings = {
enabled: boolean;
agentToolsEnabled: boolean;
};
type ComputerUseStatus = {
enabled: boolean;
runtime: 'cloud' | 'local';
available: boolean;
desktopAgentConnected?: boolean;
nutInstalled: boolean;
screenshotInstalled: boolean;
installInProgress: boolean;
agentToolsEnabled: boolean;
message: string;
};
@@ -33,7 +32,7 @@ async function readJson<T>(response: Response): Promise<T> {
}
export default function ComputerUseSettingsTab() {
const [settings, setSettings] = useState<ComputerUseSettings>({ enabled: false, agentToolsEnabled: false });
const [settings, setSettings] = useState<ComputerUseSettings>({ enabled: false });
const [status, setStatus] = useState<ComputerUseStatus | null>(null);
const [isLoading, setIsLoading] = useState(true);
const [isSaving, setIsSaving] = useState(false);
@@ -93,33 +92,61 @@ export default function ComputerUseSettingsTab() {
};
const isCloud = status?.runtime === 'cloud';
const needsRuntime = Boolean(settings.enabled && !isCloud && status && (!status.nutInstalled || !status.screenshotInstalled));
const effectiveEnabled = isCloud ? status?.enabled === true : settings.enabled;
const needsRuntime = Boolean(effectiveEnabled && !isCloud && status && (!status.nutInstalled || !status.screenshotInstalled));
const modeDescription = isCloud
? 'Cloud Computer Use connects a hosted agent to the CloudCLI desktop app on your machine. Agents create sessions automatically through MCP, and approval happens in the desktop app.'
: 'Local Computer Use runs on this machine. Agents create sessions automatically through MCP, but input actions require you to grant control from the Computer tab.';
return (
<div className="space-y-8">
<SettingsSection
title="Computer Use"
description="Let agents see your desktop and drive the mouse and keyboard through a guarded, consent-gated control loop."
description={modeDescription}
>
<SettingsCard divided>
<div className="flex flex-col gap-3 px-4 py-4">
<div className="rounded-md border border-amber-300/50 bg-amber-50 px-3 py-2 text-sm text-amber-800 dark:border-amber-900/50 dark:bg-amber-950/30 dark:text-amber-200">
Computer Use can control your entire desktop. Agents act only while you grant control from the
Computer panel, and any action stops the moment you press Stop.
{isCloud
? 'Computer Use can control your entire desktop after you approve the request in the CloudCLI desktop app. Use Stop in the Computer tab to end the active session.'
: 'Computer Use can control your entire desktop. Agents act only while you grant control from the Computer tab, and any action stops the moment you press Stop.'}
</div>
{effectiveEnabled && (
<div className="rounded-md border border-border bg-muted/40 px-3 py-2 text-sm text-muted-foreground">
{isCloud
? 'When a cloud agent needs desktop access, it will create a session automatically. Keep CloudCLI Desktop running and connected to this environment to receive approval prompts.'
: 'When a local agent needs desktop access, it will create a session automatically. Open the Computer tab to review the session, grant control, or stop it. On macOS, grant Accessibility and Screen Recording to CloudCLI Desktop if prompted.'}
</div>
)}
</div>
<SettingsRow
label="Enable Computer Use"
description="Registers Computer Use for supported agents and allows CloudCLI to create guarded desktop control sessions on this machine."
>
<SettingsToggle
checked={settings.enabled}
onChange={(value) => void updateSettings({ enabled: value, agentToolsEnabled: value })}
ariaLabel="Enable Computer Use"
disabled={isLoading || isSaving}
/>
</SettingsRow>
{isCloud ? (
<SettingsRow
label="Cloud desktop access"
description="Managed by the CloudCLI desktop app. Agents can use computer tools when a desktop agent is linked to this cloud environment."
>
<div className={`rounded-md border px-2.5 py-1 text-xs font-medium ${
status?.desktopAgentConnected
? 'border-emerald-500/30 text-emerald-600 dark:text-emerald-300'
: 'border-amber-500/30 text-amber-600 dark:text-amber-300'
}`}
>
{status?.desktopAgentConnected ? 'Desktop linked' : 'Desktop not linked'}
</div>
</SettingsRow>
) : (
<SettingsRow
label="Enable Computer Use"
description="Registers Computer Use for supported agents and allows CloudCLI to create guarded desktop control sessions on this machine."
>
<SettingsToggle
checked={settings.enabled}
onChange={(value) => void updateSettings({ enabled: value })}
ariaLabel="Enable Computer Use"
disabled={isLoading || isSaving}
/>
</SettingsRow>
)}
{(needsRuntime || isCloud || error) && (
<div className="space-y-4 px-4 py-4">