mirror of
https://github.com/siteboon/claudecodeui.git
synced 2026-06-20 07:52:00 +08:00
feat: add CloudCLI computer use semantics, desktop helper packaging, and permission onboarding
This commit is contained in:
@@ -0,0 +1,82 @@
|
||||
import { SemanticHelperProcess } from '@/modules/computer-use/semantics/helpers/semantic-helper-process.js';
|
||||
import { resolveSemanticHelper } from '@/modules/computer-use/semantics/helpers/semantic-helper-resolver.js';
|
||||
import type { SemanticAdapter, SemanticAdapterCapabilities } from '@/modules/computer-use/semantics/adapters/semantic-adapter.js';
|
||||
import type { SemanticApp, SemanticAppState, SemanticToolInput } from '@/modules/computer-use/semantics/semantic-types.js';
|
||||
|
||||
/**
 * Method names this adapter sends to the native semantic helper.
 * Each literal is written verbatim into the `method` field of a request.
 */
type HelperMethod =
  | 'list_apps' | 'get_app_state'
  | 'click_element' | 'perform_secondary_action'
  | 'set_value' | 'type_text' | 'press_key'
  | 'scroll_element' | 'drag';
|
||||
|
||||
/**
 * SemanticAdapter that forwards every operation to a native helper executable.
 *
 * The helper process is created lazily on the first request and reused for
 * all later requests made through this adapter instance.
 */
export class HelperSemanticAdapter implements SemanticAdapter {
  private helper: SemanticHelperProcess | null = null;

  /**
   * @param platform Platform whose helper binary should be resolved.
   * @param arch CPU architecture of the helper binary; defaults to the current process architecture.
   */
  constructor(
    private readonly platform: NodeJS.Platform,
    private readonly arch: NodeJS.Architecture = process.arch,
  ) {}

  /** Reports the capability flags of this adapter; every flag is advertised as supported. */
  capabilities(): SemanticAdapterCapabilities {
    const { platform } = this;
    return {
      platform,
      appDiscovery: true,
      accessibilityTree: true,
      nativeElementActions: true,
      nativeValueSetting: true,
      targetedInput: true,
    };
  }

  /** Sends `list_apps` with empty params and returns the helper's result. */
  async listApps(): Promise<SemanticApp[]> {
    const apps = await this.request('list_apps', {});
    return apps as SemanticApp[];
  }

  /** Sends `get_app_state` with `input` as params. */
  async getAppState(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.stateRequest('get_app_state', input);
  }

  /** Sends `click_element` with `input` as params. */
  async clickElement(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.stateRequest('click_element', input);
  }

  /** Sends `perform_secondary_action` with `input` as params. */
  async performSecondaryAction(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.stateRequest('perform_secondary_action', input);
  }

  /** Sends `set_value` with `input` as params. */
  async setValue(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.stateRequest('set_value', input);
  }

  /** Sends `type_text` with `input` as params. */
  async typeText(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.stateRequest('type_text', input);
  }

  /** Sends `press_key` with `input` as params. */
  async pressKey(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.stateRequest('press_key', input);
  }

  /** Sends `scroll_element` with `input` as params. */
  async scrollElement(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.stateRequest('scroll_element', input);
  }

  /** Sends `drag` with `input` as params. */
  async drag(input: SemanticToolInput): Promise<SemanticAppState> {
    return this.stateRequest('drag', input);
  }

  /** Shared path for every method whose result is treated as a SemanticAppState (unvalidated cast). */
  private async stateRequest(method: HelperMethod, input: SemanticToolInput): Promise<SemanticAppState> {
    const state = await this.request(method, input);
    return state as SemanticAppState;
  }

  /**
   * Forwards one request to the helper, resolving and creating the helper first if needed.
   * @throws Error when no helper binary is available for the configured platform/arch.
   */
  private async request(method: HelperMethod, params: Record<string, unknown>): Promise<unknown> {
    const helper = this.helper ?? this.createHelper();
    return helper.request(method, params);
  }

  /** Resolves the helper binary, stores the new process wrapper on the instance, and returns it. */
  private createHelper(): SemanticHelperProcess {
    const resolution = resolveSemanticHelper(this.platform, this.arch);
    if (!resolution.available || !resolution.path) {
      throw new Error(resolution.reason || `Semantic helper is unavailable for ${this.platform}-${this.arch}.`);
    }
    const created = new SemanticHelperProcess(resolution.path);
    this.helper = created;
    return created;
  }
}
|
||||
@@ -0,0 +1,5 @@
|
||||
import { HelperSemanticAdapter } from '@/modules/computer-use/semantics/adapters/helper-semantic-adapter.js';
|
||||
|
||||
/** Builds a helper-backed semantic adapter configured for the `darwin` platform. */
export function createMacOsSemanticAdapter(): HelperSemanticAdapter {
  const adapter = new HelperSemanticAdapter('darwin');
  return adapter;
}
|
||||
@@ -0,0 +1,23 @@
|
||||
import type { SemanticApp, SemanticAppState, SemanticToolInput } from '@/modules/computer-use/semantics/semantic-types.js';
|
||||
|
||||
/** Feature flags a semantic adapter reports through `capabilities()`. */
export type SemanticAdapterCapabilities = {
  /** Platform the adapter was created for. */
  platform: NodeJS.Platform;

  // Boolean capability flags; what each one implies is decided by the consumers of this type.
  appDiscovery: boolean;
  accessibilityTree: boolean;
  nativeElementActions: boolean;
  nativeValueSetting: boolean;
  targetedInput: boolean;
};
|
||||
|
||||
/**
 * Contract implemented by platform semantic adapters.
 * Every action method is asynchronous and resolves to a SemanticAppState.
 */
export type SemanticAdapter = {
  /** Synchronously describes what the adapter supports. */
  capabilities(): SemanticAdapterCapabilities;

  /** Resolves to the list of applications known to the adapter. */
  listApps(): Promise<SemanticApp[]>;

  // Actions: each takes a SemanticToolInput and resolves to a SemanticAppState.
  getAppState(input: SemanticToolInput): Promise<SemanticAppState>;
  clickElement(input: SemanticToolInput): Promise<SemanticAppState>;
  performSecondaryAction(input: SemanticToolInput): Promise<SemanticAppState>;
  setValue(input: SemanticToolInput): Promise<SemanticAppState>;
  typeText(input: SemanticToolInput): Promise<SemanticAppState>;
  pressKey(input: SemanticToolInput): Promise<SemanticAppState>;
  scrollElement(input: SemanticToolInput): Promise<SemanticAppState>;
  drag(input: SemanticToolInput): Promise<SemanticAppState>;
};
|
||||
@@ -0,0 +1,5 @@
|
||||
import { HelperSemanticAdapter } from '@/modules/computer-use/semantics/adapters/helper-semantic-adapter.js';
|
||||
|
||||
/** Builds a helper-backed semantic adapter configured for the `win32` platform. */
export function createWindowsSemanticAdapter(): HelperSemanticAdapter {
  const adapter = new HelperSemanticAdapter('win32');
  return adapter;
}
|
||||
@@ -0,0 +1,437 @@
|
||||
import AppKit
|
||||
import ApplicationServices
|
||||
import Foundation
|
||||
|
||||
/// String-keyed dictionary used for decoded requests and for response payloads.
typealias JSON = Dictionary<String, Any>
|
||||
|
||||
/// Serializable snapshot of one accessibility element captured during a tree walk.
struct ElementRecord {
    /// Position of the element in the walk, as a 1-based decimal string.
    let index: String
    /// Accessibility role, or "AXUnknown" when the role could not be read.
    let role: String
    /// Title, falling back to the description attribute; nil when neither is a string.
    let title: String?
    /// String value of the element, if it has one.
    let value: String?
    /// Frame with keys "x", "y", "width", "height"; nil when position or size is unavailable.
    let bounds: [String: Double]?
    /// Names of the accessibility actions the element reports.
    let actions: [String]
}
|
||||
|
||||
/// Element records of each captured state, keyed by state id.
var stateElements = [String: [ElementRecord]]()
/// Live accessibility element handles of each captured state, keyed by state id and then by element index.
var stateAxElements = [String: [String: AXUIElement]]()
|
||||
|
||||
/// Writes `object` to stdout as a single line of JSON and flushes immediately.
/// If the object cannot be encoded, a fixed `{"error":"Failed to encode JSON"}` line is written instead.
func jsonLine(_ object: Any) {
    // Flush on every exit path so the reader on the other end of the pipe sees the line at once.
    defer { fflush(stdout) }

    if JSONSerialization.isValidJSONObject(object),
       let encoded = try? JSONSerialization.data(withJSONObject: object),
       let line = String(data: encoded, encoding: .utf8) {
        print(line)
    } else {
        print("{\"error\":\"Failed to encode JSON\"}")
    }
}
|
||||
|
||||
/// Emits a success response carrying `result`; a missing request id is encoded as JSON null.
func respond(id: Any?, result: Any) {
    let requestId: Any = id ?? NSNull()
    let payload: [String: Any] = ["id": requestId, "result": result]
    jsonLine(payload)
}
|
||||
|
||||
/// Emits an error response carrying `message`; a missing request id is encoded as JSON null.
func respondError(id: Any?, _ message: String) {
    let requestId: Any = id ?? NSNull()
    let payload: [String: Any] = ["id": requestId, "error": message]
    jsonLine(payload)
}
|
||||
|
||||
/// Reads accessibility attribute `attr` from `element`.
/// Returns nil when the read fails or the value does not bridge to `String`.
func stringAttr(_ element: AXUIElement, _ attr: CFString) -> String? {
    var raw: CFTypeRef?
    let status = AXUIElementCopyAttributeValue(element, attr, &raw)
    if status != .success {
        return nil
    }
    return raw as? String
}
|
||||
|
||||
/// Reads accessibility attribute `attr` from `element`.
/// Returns nil when the read fails or the value does not bridge to `Bool`.
func boolAttr(_ element: AXUIElement, _ attr: CFString) -> Bool? {
    var raw: CFTypeRef?
    let status = AXUIElementCopyAttributeValue(element, attr, &raw)
    if status != .success {
        return nil
    }
    return raw as? Bool
}
|
||||
|
||||
/// Reads accessibility attribute `attr` from `element` as a list of elements.
/// Returns an empty array when the read fails or the value is not an element array.
func arrayAttr(_ element: AXUIElement, _ attr: CFString) -> [AXUIElement] {
    var raw: CFTypeRef?
    let status = AXUIElementCopyAttributeValue(element, attr, &raw)
    guard status == .success, let items = raw as? [AXUIElement] else {
        return []
    }
    return items
}
|
||||
|
||||
/// Lists the accessibility action names reported by `element`.
/// Returns an empty array when the query fails or the names are not strings.
func actions(_ element: AXUIElement) -> [String] {
    var rawNames: CFArray?
    let status = AXUIElementCopyActionNames(element, &rawNames)
    guard status == .success, let names = rawNames as? [String] else {
        return []
    }
    return names
}
|
||||
|
||||
/// Reads the position and size attributes of `element` and returns them as a frame dictionary.
///
/// - Returns: `["x", "y", "width", "height"]` as doubles, or nil when either attribute is
///   missing, is not an `AXValue`, or cannot be decoded as a point/size.
func bounds(_ element: AXUIElement) -> [String: Double]? {
    // Bridging casts to Core Foundation types are not checked at runtime, so verify the
    // CF type id before handing the reference to AXValueGetValue.
    func axValue(_ ref: CFTypeRef?) -> AXValue? {
        guard let ref = ref, CFGetTypeID(ref) == AXValueGetTypeID() else { return nil }
        return (ref as! AXValue)
    }

    var positionRef: CFTypeRef?
    var sizeRef: CFTypeRef?
    guard AXUIElementCopyAttributeValue(element, kAXPositionAttribute as CFString, &positionRef) == .success,
          AXUIElementCopyAttributeValue(element, kAXSizeAttribute as CFString, &sizeRef) == .success,
          let positionValue = axValue(positionRef),
          let sizeValue = axValue(sizeRef)
    else { return nil }

    var point = CGPoint.zero
    var size = CGSize.zero
    guard AXValueGetValue(positionValue, .cgPoint, &point),
          AXValueGetValue(sizeValue, .cgSize, &size)
    else { return nil }

    return [
        "x": Double(point.x),
        "y": Double(point.y),
        "width": Double(size.width),
        "height": Double(size.height),
    ]
}
|
||||
|
||||
/// Captures the role, title, value, bounds and actions of `element` into an `ElementRecord`.
///
/// The role falls back to "AXUnknown"; the title falls back to the description attribute.
func record(_ element: AXUIElement, index: String) -> ElementRecord {
    let role = stringAttr(element, kAXRoleAttribute as CFString) ?? "AXUnknown"
    let title = stringAttr(element, kAXTitleAttribute as CFString)
        ?? stringAttr(element, kAXDescriptionAttribute as CFString)
    let currentValue = stringAttr(element, kAXValueAttribute as CFString)
    let frame = bounds(element)
    let actionNames = actions(element)

    return ElementRecord(
        index: index,
        role: role,
        title: title,
        value: currentValue,
        bounds: frame,
        actions: actionNames
    )
}
|
||||
|
||||
/// Looks up the live accessibility element referenced by `stateId` + `element_index` in `params`.
/// Returns nil when either parameter is missing or not a string, or when nothing is cached under them.
func cachedElement(_ params: JSON) -> AXUIElement? {
    if let stateId = params["stateId"] as? String,
       let elementIndex = params["element_index"] as? String {
        return stateAxElements[stateId]?[elementIndex]
    }
    return nil
}
|
||||
|
||||
/// Converts an `ElementRecord` into a JSON dictionary.
/// `index`, `role` and `actions` are always present; `title`, `value` and `bounds` only when non-nil.
func dictionary(_ record: ElementRecord) -> JSON {
    var output: JSON = [:]
    output["index"] = record.index
    output["role"] = record.role
    output["actions"] = record.actions

    if let title = record.title {
        output["title"] = title
    }
    if let value = record.value {
        output["value"] = value
    }
    if let frame = record.bounds {
        output["bounds"] = frame
    }
    return output
}
|
||||
|
||||
/// Finds a running application with the regular activation policy that matches `query`.
///
/// Matching is case-insensitive and tried in this order: exact bundle identifier,
/// exact localized name, localized name containing the query.
///
/// - Throws: `NSError` (domain "CloudCLISemantics") with code 400 when `query` is empty,
///   or code 404 when no running application matches.
func resolveApp(_ query: String) throws -> NSRunningApplication {
    let normalized = query.lowercased()
    // An empty query cannot identify an app; reject it instead of relying on how
    // substring matching treats the empty string.
    guard !normalized.isEmpty else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "app is required"])
    }

    let apps = NSWorkspace.shared.runningApplications.filter { app in
        app.activationPolicy == .regular
    }

    if let app = apps.first(where: { $0.bundleIdentifier?.lowercased() == normalized }) {
        return app
    }
    if let app = apps.first(where: { ($0.localizedName ?? "").lowercased() == normalized }) {
        return app
    }
    if let app = apps.first(where: { ($0.localizedName ?? "").lowercased().contains(normalized) }) {
        return app
    }
    throw NSError(domain: "CloudCLISemantics", code: 404, userInfo: [NSLocalizedDescriptionKey: "App is not running: \(query)"])
}
|
||||
|
||||
/// Describes every running application whose activation policy is `.regular`.
///
/// Each entry has `id`, `name`, `bundleIdentifier` (empty string when absent), `pid`
/// and a constant `running: true`.
func listApps() -> [[String: Any]] {
    var summaries: [[String: Any]] = []
    for app in NSWorkspace.shared.runningApplications where app.activationPolicy == .regular {
        let bundleId = app.bundleIdentifier
        let displayName = app.localizedName
        let summary: [String: Any] = [
            "id": bundleId ?? displayName ?? "\(app.processIdentifier)",
            "name": displayName ?? bundleId ?? "Unknown",
            "bundleIdentifier": bundleId ?? "",
            "pid": Int(app.processIdentifier),
            "running": true,
        ]
        summaries.append(summary)
    }
    return summaries
}
|
||||
|
||||
/// Traverses the accessibility tree depth-first starting at `element`, recording each visited
/// element in `records` and its live handle in `axRecords` under the same 1-based index.
///
/// Traversal stops descending past `maxDepth` and stops entirely once `limit` records exist.
func walk(_ element: AXUIElement, depth: Int, maxDepth: Int, records: inout [ElementRecord], axRecords: inout [String: AXUIElement], limit: Int) {
    guard depth <= maxDepth, records.count < limit else { return }

    let index = String(records.count + 1)
    records.append(record(element, index: index))
    axRecords[index] = element

    let children = arrayAttr(element, kAXChildrenAttribute as CFString)
    for child in children {
        walk(child, depth: depth + 1, maxDepth: maxDepth, records: &records, axRecords: &axRecords, limit: limit)
        // Skip the remaining siblings as soon as the budget is exhausted.
        guard records.count < limit else { return }
    }
}
|
||||
|
||||
/// Captures the main display and returns it as a base64 `data:image/png` URL.
/// Returns nil when the capture or the PNG encoding fails.
func pngDataUrlForMainDisplay() -> String? {
    guard let capture = CGDisplayCreateImage(CGMainDisplayID()),
          let pngData = NSBitmapImageRep(cgImage: capture).representation(using: .png, properties: [:])
    else {
        return nil
    }
    return "data:image/png;base64," + pngData.base64EncodedString()
}
|
||||
|
||||
/// State ids in creation order, oldest first; used to evict stale snapshots from the caches.
var cachedStateOrder: [String] = []
/// Maximum number of state snapshots kept in `stateElements` / `stateAxElements`.
let maxCachedStates = 16

/// Snapshots the accessibility tree of the app named by `params["app"]`.
///
/// The walk starts at the app's first window (or the application element when it has no
/// windows), goes at most 5 levels deep and records at most 300 elements. The snapshot is
/// cached under a fresh state id so later requests can refer to its elements, and the
/// oldest snapshots are evicted so the caches do not grow for the lifetime of the process.
///
/// - Returns: A JSON dictionary with the state id, app name, platform, a screenshot of the
///   main display (or null), the display size, the element list and a text rendering of it.
/// - Throws: Whatever `resolveApp` throws when the app cannot be resolved.
func getAppState(_ params: JSON) throws -> JSON {
    let appName = params["app"] as? String ?? ""
    let app = try resolveApp(appName)
    let axApp = AXUIElementCreateApplication(app.processIdentifier)
    let windows = arrayAttr(axApp, kAXWindowsAttribute as CFString)
    let root = windows.first ?? axApp

    var records: [ElementRecord] = []
    var axRecords: [String: AXUIElement] = [:]
    walk(root, depth: 0, maxDepth: 5, records: &records, axRecords: &axRecords, limit: 300)

    let stateId = "state_\(UUID().uuidString)"
    stateElements[stateId] = records
    stateAxElements[stateId] = axRecords

    // Every action ends with a new snapshot, so without eviction both caches (including
    // retained AXUIElement handles) would grow without bound.
    cachedStateOrder.append(stateId)
    while cachedStateOrder.count > maxCachedStates {
        let evicted = cachedStateOrder.removeFirst()
        stateElements.removeValue(forKey: evicted)
        stateAxElements.removeValue(forKey: evicted)
    }

    let elements = records.map(dictionary)
    return [
        "stateId": stateId,
        "app": app.localizedName ?? app.bundleIdentifier ?? appName,
        "platform": "darwin",
        "screenshotDataUrl": pngDataUrlForMainDisplay() ?? NSNull(),
        "displaySize": [
            "width": Int(CGDisplayPixelsWide(CGMainDisplayID())),
            "height": Int(CGDisplayPixelsHigh(CGMainDisplayID())),
        ],
        "elements": elements,
        "accessibilityTree": elements,
        "treeText": elements.map { "\($0["index"] ?? "") \($0["role"] ?? "") \($0["title"] ?? "")" }.joined(separator: "\n"),
    ]
}
|
||||
|
||||
/// Maps a button name to a `CGMouseButton`: "right" and "middle" map to their buttons,
/// anything else (including non-string values) maps to the left button.
func cgMouseButton(_ value: Any?) -> CGMouseButton {
    let name = value as? String
    if name == "right" {
        return .right
    }
    if name == "middle" {
        return .center
    }
    return .left
}
|
||||
|
||||
/// Returns the (button-down, button-up) event types that correspond to `button`.
func mouseEventTypes(_ button: CGMouseButton) -> (CGEventType, CGEventType) {
    if button == .right {
        return (.rightMouseDown, .rightMouseUp)
    }
    if button == .center {
        return (.otherMouseDown, .otherMouseUp)
    }
    return (.leftMouseDown, .leftMouseUp)
}
|
||||
|
||||
/// Posts `clickCount` synthetic clicks (at least one) of `button` at `point`.
///
/// Each down/up pair carries its ordinal in the click-state field so that the second
/// click of a pair is delivered as a double-click rather than as two unrelated single
/// clicks. The function sleeps 80 ms after every click.
///
/// - Throws: `NSError` (domain "CloudCLISemantics", code 500) when the event source or a
///   mouse event cannot be created.
func postMouseClick(point: CGPoint, button: CGMouseButton, clickCount: Int = 1) throws {
    guard let source = CGEventSource(stateID: .combinedSessionState) else {
        throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create CGEventSource"])
    }
    let eventTypes = mouseEventTypes(button)

    for clickNumber in 1...max(1, clickCount) {
        // Surface event-creation failures instead of silently posting nothing.
        guard let down = CGEvent(mouseEventSource: source, mouseType: eventTypes.0, mouseCursorPosition: point, mouseButton: button),
              let up = CGEvent(mouseEventSource: source, mouseType: eventTypes.1, mouseCursorPosition: point, mouseButton: button)
        else {
            throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create mouse event"])
        }
        down.setIntegerValueField(.mouseEventClickState, value: Int64(clickNumber))
        up.setIntegerValueField(.mouseEventClickState, value: Int64(clickNumber))
        down.post(tap: .cghidEventTap)
        up.post(tap: .cghidEventTap)
        usleep(80_000)
    }
}
|
||||
|
||||
/// Posts a synthetic drag with `button`: button-down at `from`, one dragged event at `to`,
/// then button-up at `to`, with 80 ms pauses between the three events.
///
/// - Throws: `NSError` (domain "CloudCLISemantics", code 500) when the event source cannot be created.
func postDrag(from: CGPoint, to: CGPoint, button: CGMouseButton) throws {
    guard let source = CGEventSource(stateID: .combinedSessionState) else {
        throw NSError(domain: "CloudCLISemantics", code: 500, userInfo: [NSLocalizedDescriptionKey: "Failed to create CGEventSource"])
    }
    let eventTypes = mouseEventTypes(button)

    // The dragged event must belong to the same button family as the down/up events;
    // a left-dragged event in the middle of a right-button drag does not form a drag.
    let draggedType: CGEventType
    switch button {
    case .right: draggedType = .rightMouseDragged
    case .center: draggedType = .otherMouseDragged
    default: draggedType = .leftMouseDragged
    }

    CGEvent(mouseEventSource: source, mouseType: eventTypes.0, mouseCursorPosition: from, mouseButton: button)?.post(tap: .cghidEventTap)
    usleep(80_000)
    CGEvent(mouseEventSource: source, mouseType: draggedType, mouseCursorPosition: to, mouseButton: button)?.post(tap: .cghidEventTap)
    usleep(80_000)
    CGEvent(mouseEventSource: source, mouseType: eventTypes.1, mouseCursorPosition: to, mouseButton: button)?.post(tap: .cghidEventTap)
}
|
||||
|
||||
/// Runs `script` through `/usr/bin/osascript -e` and waits for it to finish.
///
/// - Throws: Any error from launching the process, or an `NSError` (domain
///   "CloudCLISemantics", code = exit status) carrying osascript's stderr when it exits non-zero.
func runAppleScript(_ script: String) throws {
    let process = Process()
    process.executableURL = URL(fileURLWithPath: "/usr/bin/osascript")
    process.arguments = ["-e", script]
    // Nothing reads the script's stdout, so discard it; an undrained pipe could fill up
    // and block osascript forever.
    process.standardOutput = FileHandle.nullDevice
    let stderr = Pipe()
    process.standardError = stderr
    try process.run()

    // Drain stderr before waiting: this returns at EOF (when the child closes the pipe)
    // and avoids a deadlock if the child writes more than the pipe buffer holds.
    let errorData = stderr.fileHandleForReading.readDataToEndOfFile()
    process.waitUntilExit()

    if process.terminationStatus != 0 {
        let message = String(data: errorData, encoding: .utf8) ?? "AppleScript failed."
        throw NSError(domain: "CloudCLISemantics", code: Int(process.terminationStatus), userInfo: [NSLocalizedDescriptionKey: message])
    }
}
|
||||
|
||||
/// Escapes `value` for embedding inside a double-quoted AppleScript string literal.
/// Backslashes are doubled first, then double quotes are prefixed with a backslash.
func escapedAppleScriptString(_ value: String) -> String {
    let backslashesEscaped = value.replacingOccurrences(of: "\\", with: "\\\\")
    let quotesEscaped = backslashesEscaped.replacingOccurrences(of: "\"", with: "\\\"")
    return quotesEscaped
}
|
||||
|
||||
/// Resolves the screen point an action should target.
///
/// Explicit `x`/`y` parameters win. Otherwise the centre of the cached bounds of the
/// element identified by `stateId` + `element_index` is used. Returns nil when neither
/// source yields a point.
func pointForElement(_ params: JSON) -> CGPoint? {
    if let explicitX = params["x"] as? Double, let explicitY = params["y"] as? Double {
        return CGPoint(x: explicitX, y: explicitY)
    }

    guard let stateId = params["stateId"] as? String,
          let elementIndex = params["element_index"] as? String,
          let cached = stateElements[stateId]?.first(where: { $0.index == elementIndex }),
          let frame = cached.bounds,
          let originX = frame["x"],
          let originY = frame["y"],
          let width = frame["width"],
          let height = frame["height"]
    else {
        return nil
    }

    let centerX = originX + width / 2
    let centerY = originY + height / 2
    return CGPoint(x: centerX, y: centerY)
}
|
||||
|
||||
/// Handles `click_element`.
///
/// A single left click on a cached element that advertises the press action is performed
/// through the accessibility API. In every other case, or when that action fails, a
/// synthetic mouse click is posted at the resolved point.
///
/// - Throws: `NSError` code 400 when no target point can be resolved, plus anything thrown
///   by `postMouseClick` or `getAppState`.
func click(_ params: JSON) throws -> JSON {
    let button = cgMouseButton(params["mouse_button"])
    let clickCount = params["click_count"] as? Int ?? 1

    if button == .left, clickCount == 1,
       let element = cachedElement(params),
       actions(element).contains(kAXPressAction as String) {
        let status = AXUIElementPerformAction(element, kAXPressAction as CFString)
        if status == .success {
            return try getAppState(params)
        }
    }

    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "click_element requires x/y or stateId + element_index"])
    }
    try postMouseClick(point: point, button: button, clickCount: clickCount)
    return try getAppState(params)
}
|
||||
|
||||
/// Handles `perform_secondary_action`.
///
/// Uses the accessibility show-menu action when the cached element advertises it and the
/// action succeeds; otherwise posts a synthetic right click at the resolved point.
///
/// - Throws: `NSError` code 400 when no target point can be resolved, plus anything thrown
///   by `postMouseClick` or `getAppState`.
func performSecondaryAction(_ params: JSON) throws -> JSON {
    if let element = cachedElement(params),
       actions(element).contains(kAXShowMenuAction as String) {
        let status = AXUIElementPerformAction(element, kAXShowMenuAction as CFString)
        if status == .success {
            return try getAppState(params)
        }
    }

    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "perform_secondary_action requires x/y or stateId + element_index"])
    }
    try postMouseClick(point: point, button: .right)
    return try getAppState(params)
}
|
||||
|
||||
/// Handles `set_value`.
///
/// First tries to write the value attribute of the cached element directly. If there is
/// no cached element or the write fails, it clicks the target point, sends Cmd+A and then
/// types the value through System Events.
///
/// - Throws: `NSError` code 400 when `value` is missing or no target point can be
///   resolved, plus anything thrown by the click, AppleScript or state helpers.
func setValue(_ params: JSON) throws -> JSON {
    guard let value = params["value"] as? String else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "set_value requires value"])
    }

    if let element = cachedElement(params) {
        let status = AXUIElementSetAttributeValue(element, kAXValueAttribute as CFString, value as CFTypeRef)
        if status == .success {
            return try getAppState(params)
        }
    }

    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "set_value requires x/y or stateId + element_index"])
    }

    try postMouseClick(point: point, button: .left)
    let selectAll = "tell application \"System Events\" to keystroke \"a\" using command down"
    try runAppleScript(selectAll)
    let typeValue = "tell application \"System Events\" to keystroke \"\(escapedAppleScriptString(value))\""
    try runAppleScript(typeValue)
    return try getAppState(params)
}
|
||||
|
||||
/// Handles `type_text`: types `params["text"]` (empty string when absent) through
/// System Events, then returns a fresh app state.
func typeText(_ params: JSON) throws -> JSON {
    let text = params["text"] as? String ?? ""
    let escaped = escapedAppleScriptString(text)
    let script = "tell application \"System Events\" to keystroke \"\(escaped)\""
    try runAppleScript(script)
    return try getAppState(params)
}
|
||||
|
||||
/// Builds the AppleScript ` using {...}` clause for the given modifier names.
///
/// Names are matched case-insensitively; unrecognised names are ignored. Returns an
/// empty string when no name is recognised.
func appleScriptModifiers(_ parts: [String]) -> String {
    let clauseByName: [String: String] = [
        "cmd": "command down", "command": "command down", "meta": "command down",
        "ctrl": "control down", "control": "control down",
        "alt": "option down", "option": "option down",
        "shift": "shift down",
    ]

    var clauses: [String] = []
    for part in parts {
        if let clause = clauseByName[part.lowercased()] {
            clauses.append(clause)
        }
    }

    if clauses.isEmpty {
        return ""
    }
    return " using {" + clauses.joined(separator: ", ") + "}"
}
|
||||
|
||||
/// Maps a named key (case-insensitive) to the numeric code used with AppleScript `key code`.
/// Returns nil for any name outside the table.
func appleScriptKeyCode(_ key: String) -> Int? {
    let codeByName: [String: Int] = [
        "return": 36, "enter": 36,
        "tab": 48,
        "space": 49,
        "delete": 51, "backspace": 51,
        "escape": 53, "esc": 53,
        "left": 123,
        "right": 124,
        "down": 125,
        "up": 126,
    ]
    return codeByName[key.lowercased()]
}
|
||||
|
||||
/// Handles `press_key`.
///
/// `params["key"]` is split on "+"; the last non-empty piece is the key and the pieces
/// before it are modifiers. Named keys are sent with `key code`, anything else with
/// `keystroke`, both through System Events.
func pressKey(_ params: JSON) throws -> JSON {
    let raw = params["key"] as? String ?? ""

    var parts: [String] = []
    for piece in raw.split(separator: "+") {
        let trimmed = String(piece).trimmingCharacters(in: .whitespacesAndNewlines)
        if !trimmed.isEmpty {
            parts.append(trimmed)
        }
    }

    // When splitting leaves nothing, the raw string itself is used as the key.
    let key = parts.last ?? raw
    let modifiers = appleScriptModifiers(Array(parts.dropLast()))

    let command: String
    if let keyCode = appleScriptKeyCode(key) {
        command = "key code \(keyCode)\(modifiers)"
    } else {
        command = "keystroke \"\(escapedAppleScriptString(key))\"\(modifiers)"
    }
    try runAppleScript("tell application \"System Events\" to " + command)
    return try getAppState(params)
}
|
||||
|
||||
/// Handles `scroll_element`: moves the cursor to the target point and posts one
/// line-based scroll wheel event.
///
/// `direction` ("up", "down", "left", "right"; default "down") selects the axis and sign;
/// any other value produces a zero delta on both axes. `pages` (default 1) is scaled by 8
/// lines per page with a minimum of one line.
///
/// - Throws: `NSError` code 400 when no target point can be resolved, plus anything thrown
///   by `getAppState`.
func scrollElement(_ params: JSON) throws -> JSON {
    guard let point = pointForElement(params) else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "scroll_element requires x/y or stateId + element_index"])
    }
    CGWarpMouseCursorPosition(point)

    let direction = params["direction"] as? String ?? "down"
    let pages = params["pages"] as? Double ?? 1.0
    // Converting an out-of-range Double to Int32 traps, so clamp before converting; a
    // very large `pages` value must not crash the helper.
    let lines = min(max(1.0, abs(pages) * 8.0), Double(Int32.max))
    let amount = Int32(lines)

    let vertical = direction == "up" ? amount : direction == "down" ? -amount : 0
    let horizontal = direction == "left" ? amount : direction == "right" ? -amount : 0
    CGEvent(scrollWheelEvent2Source: nil, units: .line, wheelCount: 2, wheel1: vertical, wheel2: horizontal)?.post(tap: .cghidEventTap)
    return try getAppState(params)
}
|
||||
|
||||
/// Handles `drag`: posts a drag from (`from_x`, `from_y`) to (`to_x`, `to_y`) with the
/// button named by `mouse_button`, then returns a fresh app state.
///
/// - Throws: `NSError` code 400 when any of the four coordinates is missing or not numeric.
func drag(_ params: JSON) throws -> JSON {
    guard let startX = params["from_x"] as? Double,
          let startY = params["from_y"] as? Double,
          let endX = params["to_x"] as? Double,
          let endY = params["to_y"] as? Double
    else {
        throw NSError(domain: "CloudCLISemantics", code: 400, userInfo: [NSLocalizedDescriptionKey: "drag requires from_x/from_y/to_x/to_y"])
    }

    let start = CGPoint(x: startX, y: startY)
    let end = CGPoint(x: endX, y: endY)
    let button = cgMouseButton(params["mouse_button"])
    try postDrag(from: start, to: end, button: button)
    return try getAppState(params)
}
|
||||
|
||||
/// Dispatches one decoded request to its handler and writes exactly one response line.
///
/// Unknown methods and thrown errors are reported as error responses carrying the
/// request id; `params` defaults to an empty dictionary when absent or not an object.
func handle(_ request: JSON) {
    let id = request["id"]
    let method = request["method"] as? String ?? ""
    let params = request["params"] as? JSON ?? [:]

    let result: Any
    do {
        switch method {
        case "list_apps":
            result = listApps()
        case "get_app_state":
            result = try getAppState(params)
        case "click_element":
            result = try click(params)
        case "perform_secondary_action":
            result = try performSecondaryAction(params)
        case "set_value":
            result = try setValue(params)
        case "type_text":
            result = try typeText(params)
        case "press_key":
            result = try pressKey(params)
        case "scroll_element":
            result = try scrollElement(params)
        case "drag":
            result = try drag(params)
        default:
            respondError(id: id, "Method is not implemented yet: \(method)")
            return
        }
    } catch {
        respondError(id: id, error.localizedDescription)
        return
    }
    respond(id: id, result: result)
}
|
||||
|
||||
// Request loop: read one JSON object per stdin line until EOF. Lines that are not a
// JSON object get an error response with a null id; everything else goes to `handle`.
while let line = readLine() {
    if let data = line.data(using: .utf8),
       let object = try? JSONSerialization.jsonObject(with: data),
       let request = object as? JSON {
        handle(request)
    } else {
        respondError(id: nil, "Invalid JSON request")
    }
}
|
||||
@@ -0,0 +1,124 @@
|
||||
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
|
||||
import readline from 'node:readline';
|
||||
|
||||
/** Plain JSON object: string keys with values that must be narrowed before use. */
type JsonRecord = { [key: string]: unknown };
|
||||
|
||||
/** Bookkeeping for one in-flight helper request. */
type PendingRequest = {
  /** Settles the caller's promise with the helper's result. */
  resolve(this: void, value: unknown): void;
  /** Settles the caller's promise with a failure. */
  reject(this: void, error: Error): void;
  /** Timeout timer that must be cleared once the request settles. */
  timer: ReturnType<typeof setTimeout>;
};
|
||||
|
||||
/** Request timeout parsed once at module load from CLOUDCLI_SEMANTICS_HELPER_TIMEOUT_MS (default 60000). */
const DEFAULT_TIMEOUT_MS = Number.parseInt(process.env.CLOUDCLI_SEMANTICS_HELPER_TIMEOUT_MS || '60000', 10);

/** Timeout used when the configured value is not a positive finite number. */
const FALLBACK_TIMEOUT_MS = 60000;

/** Largest delay Node timers honour; larger delays are replaced by 1 ms. */
const MAX_TIMER_DELAY_MS = 2147483647;

/**
 * Returns the timeout, in milliseconds, to apply to a helper request.
 *
 * @param configured Candidate timeout; defaults to the value parsed from the environment.
 * @returns `configured` when it is a positive finite number (capped at the largest delay
 *   a timer supports, so a huge setting cannot turn into a near-immediate timeout);
 *   otherwise 60000.
 */
function timeoutMs(configured: number = DEFAULT_TIMEOUT_MS): number {
  if (!Number.isFinite(configured) || configured <= 0) {
    return FALLBACK_TIMEOUT_MS;
  }
  return Math.min(configured, MAX_TIMER_DELAY_MS);
}
|
||||
|
||||
/** Extracts a printable message: `message` for Error instances, `String(error)` for anything else. */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
||||
|
||||
/**
 * Owns one native helper child process and exchanges line-delimited JSON with it.
 *
 * Requests are written to the child's stdin as `{ id, method, params }`; responses are
 * read from stdout one line at a time and matched to callers by numeric `id`. The child
 * is spawned lazily on the first request and respawned on the next request after it exits.
 */
export class SemanticHelperProcess {
  private child: ChildProcessWithoutNullStreams | null = null;
  private reader: readline.Interface | null = null;
  private nextId = 1;
  private pending = new Map<number, PendingRequest>();

  /** @param executablePath Path of the helper executable to spawn. */
  constructor(private readonly executablePath: string) {}

  /**
   * Sends one request and resolves with the `result` field of the matching response.
   *
   * @throws Error when the helper's stdin is not writable.
   * @returns A promise that rejects when the helper answers with an error, the request
   *   cannot be written, the request times out, or the helper stops/exits first.
   */
  async request(method: string, params: JsonRecord): Promise<unknown> {
    this.ensureStarted();
    const child = this.child;
    if (!child?.stdin.writable) {
      throw new Error('Semantic helper process is not running.');
    }

    const id = this.nextId++;
    // Serialise before registering anything so a non-serialisable payload rejects the
    // call without leaving a pending entry and a running timer behind.
    const payload = `${JSON.stringify({ id, method, params })}\n`;

    return new Promise((resolve, reject) => {
      const timer = setTimeout(() => {
        this.pending.delete(id);
        reject(new Error(`Semantic helper request timed out: ${method}`));
      }, timeoutMs());
      this.pending.set(id, { resolve, reject, timer });

      child.stdin.write(payload, (error) => {
        if (!error) {
          return;
        }
        // Fail fast on a write error instead of waiting for the timeout.
        const entry = this.pending.get(id);
        if (!entry) {
          return;
        }
        clearTimeout(entry.timer);
        this.pending.delete(id);
        entry.reject(new Error(`Failed to send semantic helper request: ${error.message}`));
      });
    });
  }

  /** Rejects all in-flight requests and terminates the helper with SIGTERM. */
  stop(): void {
    const child = this.child;
    this.child = null;
    this.reader?.close();
    this.reader = null;
    this.rejectAll('Semantic helper stopped.');
    if (child) {
      try { child.kill('SIGTERM'); } catch { /* noop */ }
    }
  }

  /** Spawns the helper and wires its stdout, stderr and lifecycle events; no-op when already running. */
  private ensureStarted(): void {
    if (this.child) {
      return;
    }

    const child = spawn(this.executablePath, [], {
      stdio: ['pipe', 'pipe', 'pipe'],
      windowsHide: true,
    });
    this.child = child;

    this.reader = readline.createInterface({ input: child.stdout });
    this.reader.on('line', (line) => this.handleLine(line));

    child.stderr.on('data', (chunk) => {
      const text = String(chunk).trim();
      if (text) {
        console.error('[SemanticHelper]', text);
      }
    });

    // Without a listener, a broken pipe on stdin is emitted as an unhandled 'error'
    // event and takes the whole host process down.
    child.stdin.on('error', (error) => {
      console.error('[SemanticHelper] stdin error:', errorMessage(error));
    });

    // Only react to lifecycle events of the child that is still current: a child that
    // was replaced after stop() must not null out or reject its successor's state.
    const release = (reason: string): void => {
      if (this.child !== child) {
        return;
      }
      this.child = null;
      this.rejectAll(reason);
    };

    child.once('error', (error) => {
      release(`Failed to start semantic helper: ${error.message}`);
    });

    child.once('exit', (code) => {
      release(`Semantic helper exited with code ${code ?? 'null'}.`);
    });
  }

  /** Parses one stdout line and settles the pending request with the same numeric `id`. */
  private handleLine(line: string): void {
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch (error) {
      console.error('[SemanticHelper] Invalid JSON response:', errorMessage(error));
      return;
    }

    // Valid JSON that is not an object (e.g. `null`) has no fields to read.
    if (typeof parsed !== 'object' || parsed === null) {
      console.error('[SemanticHelper] Ignoring non-object response.');
      return;
    }
    const message = parsed as JsonRecord;

    const id = typeof message.id === 'number' ? message.id : null;
    if (id === null) {
      // An error without an id cannot be routed to a caller; surface it in the log.
      if (typeof message.error === 'string') {
        console.error('[SemanticHelper]', message.error);
      }
      return;
    }

    const pending = this.pending.get(id);
    if (!pending) {
      return;
    }
    clearTimeout(pending.timer);
    this.pending.delete(id);

    if (message.error) {
      pending.reject(new Error(typeof message.error === 'string' ? message.error : 'Semantic helper request failed.'));
      return;
    }
    pending.resolve(message.result);
  }

  /** Rejects every in-flight request with `reason` and clears their timers. */
  private rejectAll(reason: string): void {
    const inFlight = [...this.pending.values()];
    this.pending.clear();
    for (const request of inFlight) {
      clearTimeout(request.timer);
      request.reject(new Error(reason));
    }
  }
}
|
||||
@@ -0,0 +1,97 @@
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
/** Platforms for which a helper executable name is defined. */
export type SemanticHelperPlatform =
  | 'darwin'
  | 'win32';
|
||||
|
||||
/** Outcome of looking for the helper executable for a platform/architecture pair. */
export type SemanticHelperResolution = {
  /** True when a helper file was found. */
  available: boolean;
  /** Absolute path of the helper that was found, or null when none was. */
  path: string | null;
  /** Which candidate location matched, or 'missing' when none did. */
  source: 'bundled' | 'dev' | 'missing';
  /** Platform the lookup was performed for. */
  platform: NodeJS.Platform;
  /** Architecture the lookup was performed for. */
  arch: NodeJS.Architecture;
  /** Human-readable explanation, set when the helper is unavailable. */
  reason?: string;
};
|
||||
|
||||
/**
 * Returns the helper executable file name for `platform`,
 * or null when no helper exists for that platform.
 */
function helperExecutableName(platform: NodeJS.Platform): string | null {
  switch (platform) {
    case 'darwin':
      return 'CloudCLISemantics';
    case 'win32':
      return 'CloudCLISemantics.exe';
    default:
      return null;
  }
}
|
||||
|
||||
/**
 * Reports whether `filePath` exists and is visible to this process.
 *
 * Only existence is checked, not executability: a path that passes an execute-permission
 * probe necessarily passes the existence probe as well, so one existence check gives the
 * same answer as probing both.
 */
function pathExists(filePath: string): boolean {
  try {
    fs.accessSync(filePath, fs.constants.F_OK);
  } catch {
    return false;
  }
  return true;
}
|
||||
|
||||
/**
 * Lists the locations where the helper for `platform`/`arch` may live, in lookup order:
 * the `bin/<platform>-<arch>` directory one level above this module ('bundled'), then
 * the same directory under `server/modules/computer-use/semantics` in the current
 * working directory ('dev'). Returns an empty list for platforms without a helper.
 */
function candidatePaths(platform: NodeJS.Platform, arch: NodeJS.Architecture): Array<{ source: 'bundled' | 'dev'; path: string }> {
  const executable = helperExecutableName(platform);
  if (!executable) {
    return [];
  }

  const binSegments = ['bin', `${platform}-${arch}`, executable];
  const bundledPath = path.resolve(__dirname, '..', ...binSegments);
  const devPath = path.resolve(process.cwd(), 'server', 'modules', 'computer-use', 'semantics', ...binSegments);

  return [
    { source: 'bundled', path: bundledPath },
    { source: 'dev', path: devPath },
  ];
}
|
||||
|
||||
/**
 * Locates the helper executable for a platform/architecture pair.
 *
 * @param platform Target platform; defaults to the current process platform.
 * @param arch Target architecture; defaults to the current process architecture.
 * @returns An available resolution for the first candidate path that exists; otherwise
 *   a 'missing' resolution whose `reason` says whether the platform is unsupported or
 *   the binary was not found.
 */
export function resolveSemanticHelper(
  platform: NodeJS.Platform = process.platform,
  arch: NodeJS.Architecture = process.arch,
): SemanticHelperResolution {
  const unavailable = (reason: string): SemanticHelperResolution => ({
    available: false,
    path: null,
    source: 'missing',
    platform,
    arch,
    reason,
  });

  const executable = helperExecutableName(platform);
  if (!executable) {
    return unavailable(`Semantic Computer Use helper is not supported on ${platform}.`);
  }

  const match = candidatePaths(platform, arch).find((candidate) => pathExists(candidate.path));
  if (match) {
    return {
      available: true,
      path: match.path,
      source: match.source,
      platform,
      arch,
    };
  }

  return unavailable(`Bundled semantic helper was not found for ${platform}-${arch} (${executable}).`);
}
|
||||
@@ -0,0 +1,10 @@
|
||||
<!-- Console executable built for net8.0-windows with Windows Forms enabled; the output assembly is named CloudCLISemantics. -->
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0-windows</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <UseWindowsForms>true</UseWindowsForms>
    <AssemblyName>CloudCLISemantics</AssemblyName>
  </PropertyGroup>
</Project>
|
||||
519
server/modules/computer-use/semantics/helpers/windows/Program.cs
Normal file
519
server/modules/computer-use/semantics/helpers/windows/Program.cs
Normal file
@@ -0,0 +1,519 @@
|
||||
using System.Diagnostics;
|
||||
using System.Drawing;
|
||||
using System.Drawing.Imaging;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Text.Json;
|
||||
using System.Windows.Automation;
|
||||
|
||||
static class Program
|
||||
{
|
||||
private static readonly Dictionary<string, List<ElementRecord>> StateElements = new();
|
||||
private static readonly Dictionary<string, Dictionary<string, AutomationElement>> StateAutomationElements = new();
|
||||
|
||||
/// <summary>
/// Request loop of the helper: reads one JSON request per line from standard input, dispatches
/// it by its "method" name and writes one JSON reply per request to standard output.
/// A reply is <c>{ id, result }</c> on success and <c>{ id, error }</c> on failure; requests
/// that cannot be parsed are answered with a null id. The loop ends when stdin is closed.
/// </summary>
public static void Main()
{
    // Parsed once: requests without a "params" object share this detached empty object instead
    // of allocating a JsonDocument per request that is never disposed.
    JsonElement emptyParams;
    using (var emptyDocument = JsonDocument.Parse("{}"))
    {
        emptyParams = emptyDocument.RootElement.Clone();
    }

    string? line;
    while ((line = Console.ReadLine()) != null)
    {
        // A blank line is not a request; answering it with an error would only confuse the peer.
        if (string.IsNullOrWhiteSpace(line)) continue;

        try
        {
            using var doc = JsonDocument.Parse(line);
            var root = doc.RootElement;
            // Clone everything that outlives the document.
            var id = root.TryGetProperty("id", out var idValue) ? idValue.Clone() : default;
            var method = root.TryGetProperty("method", out var methodValue) ? methodValue.GetString() ?? "" : "";
            var parameters = root.TryGetProperty("params", out var paramsValue) && paramsValue.ValueKind == JsonValueKind.Object
                ? paramsValue.Clone()
                : emptyParams;

            try
            {
                object result = method switch
                {
                    "list_apps" => ListApps(),
                    "get_app_state" => GetAppState(parameters),
                    "click_element" => ClickElement(parameters),
                    "perform_secondary_action" => PerformSecondaryAction(parameters),
                    "set_value" => SetValue(parameters),
                    "type_text" => TypeText(parameters),
                    "press_key" => PressKey(parameters),
                    "scroll_element" => ScrollElement(parameters),
                    "drag" => Drag(parameters),
                    _ => throw new InvalidOperationException($"Method is not implemented yet: {method}")
                };
                Write(new Dictionary<string, object?> { ["id"] = JsonValue(id), ["result"] = result });
            }
            catch (Exception ex)
            {
                // Handler (or serialization) failure: report it against the request id.
                Write(new Dictionary<string, object?> { ["id"] = JsonValue(id), ["error"] = ex.Message });
            }
        }
        catch (Exception ex)
        {
            // The line was not a usable JSON request, so no id is known.
            Write(new Dictionary<string, object?> { ["id"] = null, ["error"] = $"Invalid JSON request: {ex.Message}" });
        }
    }
}
|
||||
|
||||
/// <summary>
/// Converts a scalar JSON element (used for the request id) into a plain CLR value that can be
/// echoed back in the reply.
/// </summary>
/// <param name="element">Element to convert; a default (undefined) element is allowed.</param>
/// <returns>
/// A string, a <see cref="long"/> for integral numbers, a <see cref="double"/> for other
/// numbers, a bool, or null for null, undefined, objects and arrays.
/// </returns>
private static object? JsonValue(JsonElement element)
{
    switch (element.ValueKind)
    {
        case JsonValueKind.String:
            return element.GetString();
        case JsonValueKind.Number:
            // Return the long as its own boxed value. Writing this as `cond ? long : double`
            // would silently convert the integer to double and lose precision above 2^53.
            if (element.TryGetInt64(out var integer)) return integer;
            return element.GetDouble();
        case JsonValueKind.True:
            return true;
        case JsonValueKind.False:
            return false;
        default:
            return null;
    }
}
|
||||
|
||||
/// <summary>
/// Serializes <paramref name="value"/> as a single JSON line on standard output and flushes
/// immediately so the reply is not held back in a buffer.
/// </summary>
private static void Write(object value)
{
    var payload = JsonSerializer.Serialize(value);
    var stdout = Console.Out;
    stdout.WriteLine(payload);
    stdout.Flush();
}
|
||||
|
||||
/// <summary>
/// Lists every running process that owns a main window, ordered by process name.
/// </summary>
/// <returns>
/// One dictionary per process with the keys id (pid as a string), name, processName, pid,
/// running (always true) and windowTitle.
/// </returns>
private static List<Dictionary<string, object?>> ListApps()
{
    var apps = new List<Dictionary<string, object?>>();
    foreach (var process in Process.GetProcesses())
    {
        // Only plain values are copied out, so the process handle can be released right away.
        using (process)
        {
            try
            {
                if (process.MainWindowHandle == IntPtr.Zero) continue;
                apps.Add(new Dictionary<string, object?>
                {
                    ["id"] = process.Id.ToString(),
                    ["name"] = process.ProcessName,
                    ["processName"] = process.ProcessName,
                    ["pid"] = process.Id,
                    ["running"] = true,
                    ["windowTitle"] = process.MainWindowTitle
                });
            }
            catch (Exception ex) when (ex is InvalidOperationException
                or NotSupportedException
                or System.ComponentModel.Win32Exception)
            {
                // The process exited or cannot be inspected; one such process must not fail
                // the whole listing, so it is left out.
            }
        }
    }

    return apps.OrderBy(app => (string)app["name"]!).ToList();
}
|
||||
|
||||
/// <summary>
/// Finds the windowed process that best matches <paramref name="query"/>.
/// Matching is case-insensitive and tried in this order: exact process name, exact main-window
/// title, main-window title containing the query.
/// </summary>
/// <param name="query">Process name or window title; surrounding whitespace is ignored.</param>
/// <returns>The first matching process.</returns>
/// <exception cref="InvalidOperationException">
/// The query is blank, or no windowed process matches it.
/// </exception>
private static Process ResolveProcess(string query)
{
    var wanted = query.Trim();
    if (string.IsNullOrWhiteSpace(wanted))
    {
        throw new InvalidOperationException("app is required.");
    }

    // Only processes that own a main window are candidates.
    var windowed = new List<Process>();
    foreach (var candidate in Process.GetProcesses())
    {
        if (candidate.MainWindowHandle != IntPtr.Zero) windowed.Add(candidate);
    }

    const StringComparison ignoreCase = StringComparison.OrdinalIgnoreCase;
    var matchers = new Func<Process, bool>[]
    {
        process => process.ProcessName.Equals(wanted, ignoreCase),
        process => process.MainWindowTitle.Equals(wanted, ignoreCase),
        process => process.MainWindowTitle.Contains(wanted, ignoreCase),
    };

    // Strongest rule first; the first rule with a hit wins.
    foreach (var matcher in matchers)
    {
        var hit = windowed.FirstOrDefault(matcher);
        if (hit != null) return hit;
    }

    throw new InvalidOperationException($"App is not running: {query}");
}
|
||||
|
||||
/// <summary>
/// Captures a fresh snapshot of the application named by the "app" parameter: a screenshot of
/// the primary screen, the main window's title and bounds, and a flattened UI Automation tree
/// (at most 300 elements, walked from depth 0 through 5).
/// The snapshot is stored under a new state id so later requests can address elements by
/// "stateId" + "element_index".
/// </summary>
/// <param name="parameters">Request parameters; "app" is required.</param>
/// <returns>The serializable state dictionary sent back to the caller.</returns>
/// <exception cref="InvalidOperationException">
/// The app is missing or not running, or its window has no UI Automation root.
/// </exception>
private static Dictionary<string, object?> GetAppState(JsonElement parameters)
{
    var process = ResolveProcess(ReadString(parameters, "app"));
    var root = AutomationElement.FromHandle(process.MainWindowHandle)
        ?? throw new InvalidOperationException("No UI Automation root window is available.");

    var records = new List<ElementRecord>();
    var automationElements = new Dictionary<string, AutomationElement>();
    Walk(root, records, automationElements, 0, 5, 300);

    var stateId = $"state_{Guid.NewGuid()}";
    RememberState(stateId, records, automationElements);

    var elements = records.Select(record => record.ToDictionary()).ToList();
    var windowBounds = root.Current.BoundingRectangle;
    var screenshot = CaptureScreen();
    var screen = System.Windows.Forms.Screen.PrimaryScreen!.Bounds;

    return new Dictionary<string, object?>
    {
        ["stateId"] = stateId,
        ["app"] = process.ProcessName,
        ["platform"] = "win32",
        ["screenshotDataUrl"] = screenshot,
        ["displaySize"] = new Dictionary<string, object?>
        {
            ["width"] = screen.Width,
            ["height"] = screen.Height
        },
        ["window"] = new Dictionary<string, object?>
        {
            ["title"] = process.MainWindowTitle,
            ["bounds"] = BoundsDictionary(windowBounds)
        },
        ["elements"] = elements,
        ["accessibilityTree"] = elements,
        ["treeText"] = string.Join("\n", elements.Select(element => $"{element["index"]} {element["role"]} {element.GetValueOrDefault("title")}"))
    };
}

/// <summary>
/// Upper bound on retained snapshots. Every action ends with a new snapshot, so without a cap
/// the state tables would grow for the lifetime of the helper process.
/// </summary>
private const int MaxRetainedStates = 64;

/// <summary>State ids in creation order, oldest first; drives eviction.</summary>
private static readonly Queue<string> StateOrder = new();

/// <summary>
/// Stores a snapshot under <paramref name="stateId"/> and evicts the oldest snapshots once more
/// than <see cref="MaxRetainedStates"/> are held. Requests that name an evicted state no longer
/// resolve elements by index.
/// </summary>
private static void RememberState(string stateId, List<ElementRecord> records, Dictionary<string, AutomationElement> automationElements)
{
    StateElements[stateId] = records;
    StateAutomationElements[stateId] = automationElements;
    StateOrder.Enqueue(stateId);

    while (StateOrder.Count > MaxRetainedStates)
    {
        var evicted = StateOrder.Dequeue();
        StateElements.Remove(evicted);
        StateAutomationElements.Remove(evicted);
    }
}
|
||||
|
||||
/// <summary>
/// Clicks an element. A single left click on an element addressed by stateId + element_index is
/// first attempted through the UI Automation Invoke pattern; every other case, and any failed
/// invoke, falls back to a synthesized mouse click at x/y or at the element's stored centre.
/// </summary>
/// <param name="parameters">
/// Reads "mouse_button", "click_count" (default 1), and either "x"/"y" or
/// "stateId" + "element_index"; "app" is needed for the returned snapshot.
/// </param>
/// <returns>A fresh application state taken after the click.</returns>
/// <exception cref="InvalidOperationException">No click position could be determined.</exception>
private static Dictionary<string, object?> ClickElement(JsonElement parameters)
{
    var button = ReadString(parameters, "mouse_button");

    // The native path only applies to a plain single left click.
    if (button is "" or "left")
    {
        if (ReadInt(parameters, "click_count", 1) == 1)
        {
            var target = AutomationElementFor(parameters);
            if (target != null && TryInvoke(target))
            {
                return GetAppState(parameters);
            }
        }
    }

    if (PointFor(parameters) is not { } position)
    {
        throw new InvalidOperationException("click_element requires x/y or stateId + element_index.");
    }

    var clicks = ReadInt(parameters, "click_count", 1);
    SendMouseClick(position.X, position.Y, button, clicks);
    return GetAppState(parameters);
}
|
||||
|
||||
/// <summary>
/// Performs the secondary (context-menu) action by right-clicking once at x/y or at the centre
/// of the element addressed by stateId + element_index.
/// </summary>
/// <returns>A fresh application state taken after the click.</returns>
/// <exception cref="InvalidOperationException">No click position could be determined.</exception>
private static Dictionary<string, object?> PerformSecondaryAction(JsonElement parameters)
{
    if (PointFor(parameters) is not { } target)
    {
        throw new InvalidOperationException("perform_secondary_action requires x/y or stateId + element_index.");
    }

    SendMouseClick(target.X, target.Y, "right", 1);
    return GetAppState(parameters);
}
|
||||
|
||||
/// <summary>
/// Replaces the content of an element with the "value" parameter.
/// The UI Automation Value pattern is used when the element offers it. Otherwise the element is
/// focused (SetFocus and/or a left click at its position), its content is selected with Ctrl+A
/// and the new value is typed over the selection.
/// </summary>
/// <param name="parameters">
/// Reads "value" and either "x"/"y" or "stateId" + "element_index"; "app" is needed for the
/// returned snapshot.
/// </param>
/// <returns>A fresh application state taken after the change.</returns>
/// <exception cref="InvalidOperationException">
/// No target was given, or the target could not be focused.
/// </exception>
private static Dictionary<string, object?> SetValue(JsonElement parameters)
{
    var value = ReadString(parameters, "value");
    var element = AutomationElementFor(parameters);
    var focused = false;

    if (element != null)
    {
        if (element.TryGetCurrentPattern(ValuePattern.Pattern, out var valuePattern))
        {
            ((ValuePattern)valuePattern).SetValue(value);
            return GetAppState(parameters);
        }

        try
        {
            element.SetFocus();
            focused = true;
        }
        catch
        {
            // Best effort: fall through to focusing by coordinates below.
        }
    }

    var point = PointFor(parameters);
    if (point != null)
    {
        SendMouseClick(point.Value.X, point.Value.Y, "left", 1);
        focused = true;
    }
    else if (!focused)
    {
        throw new InvalidOperationException(element == null
            ? "set_value requires x/y or stateId + element_index."
            : "set_value could not focus the requested element.");
    }

    System.Windows.Forms.SendKeys.SendWait("^a");
    // Typing replaces the selection, but an empty value types nothing and would leave the old
    // content selected and in place. Delete the selection explicitly in that case.
    System.Windows.Forms.SendKeys.SendWait(value.Length == 0 ? "{DELETE}" : EscapeSendKeys(value));
    return GetAppState(parameters);
}
|
||||
|
||||
/// <summary>
/// Types the "text" parameter as literal keystrokes into whatever currently has keyboard focus.
/// </summary>
/// <returns>A fresh application state taken after typing.</returns>
private static Dictionary<string, object?> TypeText(JsonElement parameters)
{
    // Escape first so characters that SendKeys treats as commands are typed literally.
    var keystrokes = EscapeSendKeys(ReadString(parameters, "text"));
    System.Windows.Forms.SendKeys.SendWait(keystrokes);
    return GetAppState(parameters);
}
|
||||
|
||||
/// <summary>
/// Presses the key or key chord named by the "key" parameter (for example "enter" or "ctrl+c")
/// in whatever currently has keyboard focus.
/// </summary>
/// <returns>A fresh application state taken after the key press.</returns>
private static Dictionary<string, object?> PressKey(JsonElement parameters)
{
    var chord = ToSendKeysChord(ReadString(parameters, "key"));
    System.Windows.Forms.SendKeys.SendWait(chord);
    return GetAppState(parameters);
}
|
||||
|
||||
/// <summary>
/// Scrolls an element. The UI Automation Scroll pattern is used when the element offers it and
/// can scroll along the requested axis; otherwise the cursor is moved over the target and a
/// mouse-wheel event is synthesized.
/// </summary>
/// <param name="parameters">
/// Reads "direction" ("up", "down", "left" or "right"; anything else is treated as "down"),
/// "pages" (default 1, clamped to 1..100) and either "x"/"y" or "stateId" + "element_index";
/// "app" is needed for the returned snapshot.
/// </param>
/// <returns>A fresh application state taken after scrolling.</returns>
/// <exception cref="InvalidOperationException">
/// The wheel fallback is needed but no position could be determined.
/// </exception>
private static Dictionary<string, object?> ScrollElement(JsonElement parameters)
{
    const uint MouseEventWheel = 0x0800;   // MOUSEEVENTF_WHEEL
    const uint MouseEventHWheel = 0x1000;  // MOUSEEVENTF_HWHEEL
    const int WheelDelta = 120;            // one wheel notch

    var element = AutomationElementFor(parameters);
    var direction = ReadString(parameters, "direction");
    var pages = Math.Clamp(ReadDouble(parameters, "pages", 1), 1, 100);
    var horizontal = direction == "left" || direction == "right";
    var backwards = direction == "up" || direction == "left";

    if (element != null && element.TryGetCurrentPattern(ScrollPattern.Pattern, out var scrollPatternValue))
    {
        var scrollPattern = (ScrollPattern)scrollPatternValue;
        // Scroll() throws when the control cannot scroll along the requested axis, so check
        // first and use the wheel fallback in that case.
        var scrollable = horizontal
            ? scrollPattern.Current.HorizontallyScrollable
            : scrollPattern.Current.VerticallyScrollable;
        if (scrollable)
        {
            var amount = backwards ? ScrollAmount.LargeDecrement : ScrollAmount.LargeIncrement;
            var repetitions = Math.Max(1, (int)Math.Round(pages));
            // One large increment is one page; repeat to honour "pages".
            for (var i = 0; i < repetitions; i++)
            {
                scrollPattern.Scroll(
                    horizontal ? amount : ScrollAmount.NoAmount,
                    horizontal ? ScrollAmount.NoAmount : amount);
            }
            return GetAppState(parameters);
        }
    }

    var point = PointFor(parameters);
    if (point == null)
    {
        throw new InvalidOperationException("scroll_element requires x/y or stateId + element_index.");
    }

    SetCursorPos(point.Value.X, point.Value.Y);
    var magnitude = (int)Math.Round(pages * WheelDelta);
    // Wheel sign convention: a positive vertical delta rotates the wheel away from the user
    // (content scrolls up); a positive horizontal delta tilts the wheel to the right.
    var wheel = horizontal
        ? (backwards ? -magnitude : magnitude)
        : (backwards ? magnitude : -magnitude);
    mouse_event(horizontal ? MouseEventHWheel : MouseEventWheel, 0, 0, unchecked((uint)wheel), UIntPtr.Zero);
    return GetAppState(parameters);
}
|
||||
|
||||
/// <summary>
/// Drags with the left mouse button from (from_x, from_y) to (to_x, to_y): press at the start,
/// move to the end, release, with an 80 ms pause after the press and after the move.
/// </summary>
/// <param name="parameters">
/// Reads the four numeric coordinates; "app" is needed for the returned snapshot.
/// </param>
/// <returns>A fresh application state taken after the drag.</returns>
/// <exception cref="InvalidOperationException">A coordinate is missing.</exception>
private static Dictionary<string, object?> Drag(JsonElement parameters)
{
    const uint LeftDown = 0x0002;
    const uint LeftUp = 0x0004;
    const int PauseMs = 80;

    // NaN marks a coordinate that was not supplied.
    var coordinates = new[] { "from_x", "from_y", "to_x", "to_y" }
        .Select(name => ReadDouble(parameters, name, double.NaN))
        .ToArray();
    if (coordinates.Any(coordinate => double.IsNaN(coordinate)))
    {
        throw new InvalidOperationException("drag requires from_x/from_y/to_x/to_y.");
    }

    var startX = (int)Math.Round(coordinates[0]);
    var startY = (int)Math.Round(coordinates[1]);
    var endX = (int)Math.Round(coordinates[2]);
    var endY = (int)Math.Round(coordinates[3]);

    SetCursorPos(startX, startY);
    mouse_event(LeftDown, 0, 0, 0, UIntPtr.Zero);
    Thread.Sleep(PauseMs);
    SetCursorPos(endX, endY);
    Thread.Sleep(PauseMs);
    mouse_event(LeftUp, 0, 0, 0, UIntPtr.Zero);
    return GetAppState(parameters);
}
|
||||
|
||||
/// <summary>
/// Depth-first, pre-order walk of the UI Automation tree that flattens it into
/// <paramref name="records"/> and remembers the live element for every record.
/// Indexes are the 1-based position in visit order, stored as strings.
/// </summary>
/// <param name="element">Element to visit.</param>
/// <param name="records">Receives one record per visited element.</param>
/// <param name="automationElements">Receives the live element, keyed by record index.</param>
/// <param name="depth">Depth of <paramref name="element"/>; the root is 0.</param>
/// <param name="maxDepth">Deepest level that is still recorded.</param>
/// <param name="limit">Maximum number of records to collect.</param>
private static void Walk(AutomationElement element, List<ElementRecord> records, Dictionary<string, AutomationElement> automationElements, int depth, int maxDepth, int limit)
{
    if (depth > maxDepth || records.Count >= limit) return;

    AutomationElementCollection children;
    try
    {
        var index = (records.Count + 1).ToString();
        var record = ElementRecord.From(element, index);
        records.Add(record);
        automationElements[index] = element;
        children = element.FindAll(TreeScope.Children, Condition.TrueCondition);
    }
    catch (ElementNotAvailableException)
    {
        // The UI changed while it was being read and this element is gone. Skip its subtree
        // rather than failing the whole snapshot.
        return;
    }

    foreach (AutomationElement child in children)
    {
        Walk(child, records, automationElements, depth + 1, maxDepth, limit);
        if (records.Count >= limit) return;
    }
}
|
||||
|
||||
/// <summary>
/// Reads a string property from a JSON object.
/// </summary>
/// <param name="element">JSON object to read from.</param>
/// <param name="property">Property name.</param>
/// <returns>
/// The string value, or an empty string when the property is absent or is not a JSON string.
/// </returns>
private static string ReadString(JsonElement element, string property)
{
    if (!element.TryGetProperty(property, out var value)) return "";
    if (value.ValueKind != JsonValueKind.String) return "";
    return value.GetString() ?? "";
}
|
||||
|
||||
/// <summary>
/// Reads a 32-bit integer property from a JSON object.
/// </summary>
/// <param name="element">JSON object to read from.</param>
/// <param name="property">Property name.</param>
/// <param name="defaultValue">Value returned when the property cannot be used.</param>
/// <returns>
/// The integer value, or <paramref name="defaultValue"/> when the property is absent, is not a
/// JSON number, or does not fit an Int32.
/// </returns>
private static int ReadInt(JsonElement element, string property, int defaultValue)
{
    if (!element.TryGetProperty(property, out var value)) return defaultValue;
    // TryGetInt32 throws for anything that is not a JSON number (e.g. "2" or null), so check
    // the kind first and treat those as "not provided".
    if (value.ValueKind != JsonValueKind.Number) return defaultValue;
    return value.TryGetInt32(out var number) ? number : defaultValue;
}
|
||||
|
||||
/// <summary>
/// Reads a floating-point property from a JSON object.
/// </summary>
/// <param name="element">JSON object to read from.</param>
/// <param name="property">Property name.</param>
/// <param name="defaultValue">Value returned when the property cannot be used.</param>
/// <returns>
/// The numeric value, or <paramref name="defaultValue"/> when the property is absent or is not
/// a JSON number.
/// </returns>
private static double ReadDouble(JsonElement element, string property, double defaultValue)
{
    if (!element.TryGetProperty(property, out var value)) return defaultValue;
    // TryGetDouble throws for anything that is not a JSON number (e.g. "1.5" or null), so check
    // the kind first and treat those as "not provided".
    if (value.ValueKind != JsonValueKind.Number) return defaultValue;
    return value.TryGetDouble(out var number) ? number : defaultValue;
}
|
||||
|
||||
/// <summary>
/// Looks up the live UI Automation element addressed by the "stateId" and "element_index"
/// parameters.
/// </summary>
/// <returns>
/// The element, or null when either parameter is blank or the state or index is unknown.
/// </returns>
private static AutomationElement? AutomationElementFor(JsonElement parameters)
{
    var stateId = ReadString(parameters, "stateId");
    var elementIndex = ReadString(parameters, "element_index");

    if (string.IsNullOrWhiteSpace(stateId) || string.IsNullOrWhiteSpace(elementIndex))
    {
        return null;
    }

    if (!StateAutomationElements.TryGetValue(stateId, out var elementsByIndex))
    {
        return null;
    }

    return elementsByIndex.TryGetValue(elementIndex, out var found) ? found : null;
}
|
||||
|
||||
/// <summary>
/// Determines the screen position a pointer action should target: explicit numeric "x"/"y"
/// parameters win; otherwise the centre of the stored bounds of the element addressed by
/// "stateId" + "element_index" is used.
/// </summary>
/// <returns>
/// The target point, or null when no usable position is available (no coordinates, unknown
/// state or element, or an element without a usable on-screen rectangle).
/// </returns>
private static System.Drawing.Point? PointFor(JsonElement parameters)
{
    // Check the kind first: TryGetDouble throws for values that are not JSON numbers.
    if (parameters.TryGetProperty("x", out var xValue) && xValue.ValueKind == JsonValueKind.Number
        && parameters.TryGetProperty("y", out var yValue) && yValue.ValueKind == JsonValueKind.Number
        && xValue.TryGetDouble(out var x) && yValue.TryGetDouble(out var y))
    {
        return new System.Drawing.Point((int)Math.Round(x), (int)Math.Round(y));
    }

    var stateId = ReadString(parameters, "stateId");
    var elementIndex = ReadString(parameters, "element_index");
    if (string.IsNullOrWhiteSpace(stateId) || string.IsNullOrWhiteSpace(elementIndex)) return null;
    if (!StateElements.TryGetValue(stateId, out var elements)) return null;

    var element = elements.FirstOrDefault(item => item.Index == elementIndex);
    if (element?.Bounds is not { } rect) return null;

    // Elements that show no UI report an empty rectangle (infinite coordinates) or one with no
    // area. Neither has a meaningful centre, and clicking it would hit an unrelated spot.
    if (rect.IsEmpty || rect.Width <= 0 || rect.Height <= 0) return null;

    var centerX = rect.Left + rect.Width / 2;
    var centerY = rect.Top + rect.Height / 2;
    if (!double.IsFinite(centerX) || !double.IsFinite(centerY)) return null;

    return new System.Drawing.Point((int)Math.Round(centerX), (int)Math.Round(centerY));
}
|
||||
|
||||
/// <summary>
/// Takes a screenshot of the primary screen.
/// </summary>
/// <returns>The screenshot as a PNG <c>data:</c> URL (base64-encoded).</returns>
private static string CaptureScreen()
{
    var screenArea = System.Windows.Forms.Screen.PrimaryScreen!.Bounds;
    using var encoded = new MemoryStream();

    using (var snapshot = new Bitmap(screenArea.Width, screenArea.Height))
    using (var canvas = Graphics.FromImage(snapshot))
    {
        // Copy the screen pixels into the bitmap, then encode the bitmap as PNG.
        canvas.CopyFromScreen(screenArea.Left, screenArea.Top, 0, 0, screenArea.Size);
        snapshot.Save(encoded, ImageFormat.Png);
    }

    return "data:image/png;base64," + Convert.ToBase64String(encoded.ToArray());
}
|
||||
|
||||
/// <summary>
/// Converts a rectangle into the x/y/width/height dictionary used in replies.
/// </summary>
/// <param name="rect">Rectangle to convert; may be <c>Rect.Empty</c>.</param>
/// <returns>
/// The four components. Non-finite components (an empty rectangle has infinite coordinates)
/// are reported as 0, because the JSON serializer rejects Infinity and NaN and would otherwise
/// fail the whole reply.
/// </returns>
private static Dictionary<string, object?> BoundsDictionary(System.Windows.Rect rect)
{
    static double Finite(double component) => double.IsFinite(component) ? component : 0;

    return new Dictionary<string, object?>
    {
        ["x"] = Finite(rect.X),
        ["y"] = Finite(rect.Y),
        ["width"] = Finite(rect.Width),
        ["height"] = Finite(rect.Height)
    };
}
|
||||
|
||||
/// <summary>Win32 <c>SetCursorPos</c>: moves the mouse cursor to the given screen coordinates.</summary>
[DllImport("user32.dll", EntryPoint = "SetCursorPos")]
[return: MarshalAs(UnmanagedType.Bool)]
private static extern bool SetCursorPos(int x, int y);

/// <summary>
/// Win32 <c>mouse_event</c>: synthesizes mouse button and wheel input at the current cursor
/// position. <paramref name="dwFlags"/> selects the event; <paramref name="dwData"/> carries
/// the wheel amount for wheel events.
/// </summary>
[DllImport("user32.dll", EntryPoint = "mouse_event")]
private static extern void mouse_event(uint dwFlags, uint dx, uint dy, uint dwData, UIntPtr dwExtraInfo);
|
||||
|
||||
/// <summary>
/// Moves the cursor to (x, y) and synthesizes one or more clicks there.
/// </summary>
/// <param name="x">Screen x coordinate.</param>
/// <param name="y">Screen y coordinate.</param>
/// <param name="button">"right" or "middle"; anything else clicks the left button.</param>
/// <param name="clickCount">Number of clicks; values below 1 are treated as 1.</param>
private static void SendMouseClick(int x, int y, string button, int clickCount)
{
    // mouse_event flag pairs (button down, button up).
    uint press;
    uint release;
    switch (button)
    {
        case "right":
            press = 0x0008u;
            release = 0x0010u;
            break;
        case "middle":
            press = 0x0020u;
            release = 0x0040u;
            break;
        default:
            press = 0x0002u;
            release = 0x0004u;
            break;
    }

    SetCursorPos(x, y);

    var remaining = Math.Max(1, clickCount);
    while (remaining-- > 0)
    {
        mouse_event(press, 0, 0, 0, UIntPtr.Zero);
        mouse_event(release, 0, 0, 0, UIntPtr.Zero);
        // Short pause after every click.
        Thread.Sleep(80);
    }
}
|
||||
|
||||
/// <summary>
/// Activates an element through the UI Automation Invoke pattern.
/// </summary>
/// <returns>
/// True when the element offered the pattern and the invoke call completed; false when the
/// pattern is unavailable or anything went wrong, so the caller can fall back to a mouse click.
/// </returns>
private static bool TryInvoke(AutomationElement element)
{
    try
    {
        if (element.TryGetCurrentPattern(InvokePattern.Pattern, out var raw) && raw is InvokePattern invoker)
        {
            invoker.Invoke();
            return true;
        }

        return false;
    }
    catch
    {
        // Deliberate best effort: every failure means "not invoked".
        return false;
    }
}
|
||||
|
||||
/// <summary>
/// Escapes text so that SendKeys types it literally. The characters
/// <c>{ } + ^ % ~ ( ) [ ]</c> have a special meaning to SendKeys and are each wrapped in braces.
/// </summary>
/// <param name="value">Literal text to type.</param>
/// <returns>The SendKeys expression that types <paramref name="value"/>.</returns>
private static string EscapeSendKeys(string value)
{
    // One pass over the input. Chained Replace calls would re-escape the braces inserted by an
    // earlier replacement and turn "{" into "{{{}}" instead of "{{}".
    var escaped = new System.Text.StringBuilder(value.Length + 8);
    foreach (var ch in value)
    {
        switch (ch)
        {
            case '{':
            case '}':
            case '+':
            case '^':
            case '%':
            case '~':
            case '(':
            case ')':
            case '[':
            case ']':
                escaped.Append('{').Append(ch).Append('}');
                break;
            default:
                escaped.Append(ch);
                break;
        }
    }

    return escaped.ToString();
}
|
||||
|
||||
/// <summary>
/// Translates a key description such as "enter", "ctrl+c" or "ctrl+shift+tab" into a SendKeys
/// expression. Parts are separated by '+'; every part but the last is a modifier.
/// Modifier mapping: ctrl/control and cmd/win/windows become Ctrl (^), alt becomes Alt (%),
/// shift becomes Shift (+). Unknown modifiers are ignored.
/// </summary>
/// <param name="key">Key or chord description; surrounding whitespace is ignored.</param>
/// <returns>The SendKeys expression for the chord.</returns>
private static string ToSendKeysChord(string key)
{
    var normalized = key.Trim();
    if (!normalized.Contains('+'))
    {
        return SendKeyName(normalized);
    }

    string keyName;
    string modifierText;
    if (normalized.EndsWith('+'))
    {
        // A trailing '+' is the plus key itself ("+", "ctrl++"), not a separator. Splitting on
        // '+' alone would drop it and promote a modifier to the key position.
        keyName = "+";
        modifierText = normalized[..^1];
    }
    else
    {
        var separator = normalized.LastIndexOf('+');
        keyName = normalized[(separator + 1)..].Trim();
        modifierText = normalized[..separator];
    }

    var modifiers = "";
    foreach (var part in modifierText.Split('+', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries))
    {
        modifiers += part.ToLowerInvariant() switch
        {
            "ctrl" or "control" => "^",
            "alt" => "%",
            "shift" => "+",
            "cmd" or "win" or "windows" => "^",
            _ => ""
        };
    }

    return modifiers + SendKeyName(keyName);
}
|
||||
|
||||
/// <summary>
/// Translates a single key name into its SendKeys form. Known names (case-insensitive) map to
/// SendKeys keywords, a single character is typed literally, and any other name is passed
/// through upper-cased in braces (for example "f5" becomes "{F5}").
/// </summary>
/// <param name="key">Key name or single character.</param>
/// <returns>The SendKeys expression for the key.</returns>
/// <exception cref="InvalidOperationException">The key name is empty.</exception>
private static string SendKeyName(string key)
{
    if (key.Length == 0)
    {
        // An empty name would otherwise produce "{}", which SendKeys rejects with an unhelpful
        // message.
        throw new InvalidOperationException("key is required.");
    }

    return key.ToLowerInvariant() switch
    {
        "return" or "enter" => "{ENTER}",
        "escape" or "esc" => "{ESC}",
        "tab" => "{TAB}",
        "backspace" => "{BACKSPACE}",
        "delete" or "del" => "{DELETE}",
        "insert" or "ins" => "{INSERT}",
        "left" or "arrowleft" => "{LEFT}",
        "right" or "arrowright" => "{RIGHT}",
        "up" or "arrowup" => "{UP}",
        "down" or "arrowdown" => "{DOWN}",
        // SendKeys only knows the abbreviated keywords for the paging keys.
        "pageup" or "pgup" => "{PGUP}",
        "pagedown" or "pgdn" => "{PGDN}",
        "space" => " ",
        _ => key.Length == 1 ? EscapeSendKeys(key) : $"{{{key.ToUpperInvariant()}}}"
    };
}
|
||||
|
||||
/// <summary>
/// Serializable snapshot of one UI Automation element as captured during a tree walk.
/// </summary>
/// <param name="Index">Identifier of the element within its state snapshot.</param>
/// <param name="Role">Control type name without the "ControlType." prefix.</param>
/// <param name="Title">The element's UI Automation name.</param>
/// <param name="Value">Current Value-pattern text, when the element exposes one.</param>
/// <param name="Bounds">Screen rectangle, or null when the element has no usable rectangle.</param>
/// <param name="Actions">Programmatic names of the patterns the element supports.</param>
private sealed record ElementRecord(
    string Index,
    string Role,
    string? Title,
    string? Value,
    System.Windows.Rect? Bounds,
    List<string> Actions)
{
    /// <summary>Captures the current properties of <paramref name="element"/>.</summary>
    public static ElementRecord From(AutomationElement element, string index)
    {
        var patterns = element.GetSupportedPatterns().Select(pattern => pattern.ProgrammaticName).ToList();
        var current = element.Current;
        return new ElementRecord(
            index,
            current.ControlType.ProgrammaticName.Replace("ControlType.", ""),
            current.Name,
            TryValue(element),
            UsableBounds(current.BoundingRectangle),
            patterns
        );
    }

    /// <summary>
    /// Converts the record into the dictionary sent to the caller. "index", "role" and
    /// "actions" are always present; "title", "value" and "bounds" only when they carry data.
    /// </summary>
    public Dictionary<string, object?> ToDictionary()
    {
        var output = new Dictionary<string, object?>
        {
            ["index"] = Index,
            ["role"] = Role,
            ["actions"] = Actions
        };
        if (!string.IsNullOrEmpty(Title)) output["title"] = Title;
        if (!string.IsNullOrEmpty(Value)) output["value"] = Value;
        if (Bounds != null) output["bounds"] = BoundsDictionary(Bounds.Value);
        return output;
    }

    /// <summary>
    /// Returns the rectangle when all of its components are finite, otherwise null. Elements
    /// that show no UI report an empty rectangle with infinite coordinates, which cannot be
    /// written as JSON and would fail the serialization of the whole state.
    /// </summary>
    private static System.Windows.Rect? UsableBounds(System.Windows.Rect rect)
    {
        if (rect.IsEmpty) return null;
        var finite = double.IsFinite(rect.X) && double.IsFinite(rect.Y)
            && double.IsFinite(rect.Width) && double.IsFinite(rect.Height);
        return finite ? rect : null;
    }

    /// <summary>
    /// Reads the element's Value-pattern text; null when the pattern is missing or the read
    /// fails.
    /// </summary>
    private static string? TryValue(AutomationElement element)
    {
        try
        {
            if (element.TryGetCurrentPattern(ValuePattern.Pattern, out var pattern))
            {
                return ((ValuePattern)pattern).Current.Value;
            }
        }
        catch
        {
            // Best effort: an unreadable value is reported as "no value".
            return null;
        }
        return null;
    }
}
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
import { randomUUID } from 'node:crypto';
|
||||
|
||||
import type { SemanticAppState, SemanticElement } from '@/modules/computer-use/semantics/semantic-types.js';
|
||||
|
||||
/** Lifetime used when the environment does not provide a usable override: ten minutes. */
const FALLBACK_STATE_TTL_MS = 10 * 60 * 1000;

/**
 * Parses a TTL override. Anything that is not a positive integer number of milliseconds
 * (unset, empty, non-numeric, zero, negative) yields the fallback.
 */
function resolveStateTtl(raw: string | undefined): number {
  const parsed = Number.parseInt(raw || String(FALLBACK_STATE_TTL_MS), 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : FALLBACK_STATE_TTL_MS;
}

/**
 * How long a stored state stays retrievable, in milliseconds. Read once at module load from
 * `CLOUDCLI_COMPUTER_SEMANTIC_STATE_TTL_MS`.
 */
const DEFAULT_STATE_TTL_MS = resolveStateTtl(process.env.CLOUDCLI_COMPUTER_SEMANTIC_STATE_TTL_MS);

/** One stored snapshot together with the bookkeeping needed to find and expire it. */
type StoredState = {
  sessionId: string;
  /** Normalized (trimmed, lower-cased) app name the state belongs to. */
  appKey: string;
  state: SemanticAppState;
  /** `Date.now()` at the time the state was saved. */
  updatedAt: number;
};

/** Normalizes an app name so lookups ignore case and surrounding whitespace. */
function normalizeAppKey(app: string): string {
  return app.trim().toLowerCase();
}

/**
 * In-memory store of semantic app states, keyed by state id, that also tracks the most
 * recently saved state per (session, app). Entries older than the TTL are dropped whenever the
 * store is read or written.
 */
export class SemanticSessionStore {
  private readonly states = new Map<string, StoredState>();
  /** `${sessionId}:${appKey}` → id of the most recently saved state for that pair. */
  private readonly latestBySessionApp = new Map<string, string>();

  /** Creates a new unique state id. */
  createStateId(): string {
    return `state_${randomUUID()}`;
  }

  /**
   * Stores `state` for `sessionId` and makes it the latest state of its app.
   *
   * @returns The stored state; it carries a generated `stateId` when the input had none.
   */
  save(sessionId: string, state: SemanticAppState): SemanticAppState {
    // States carry screenshots, so expire on writes too; otherwise a session that only ever
    // saves would grow without bound.
    this.expire();

    const appKey = normalizeAppKey(state.app);
    const nextState = {
      ...state,
      stateId: state.stateId || this.createStateId(),
    };
    this.states.set(nextState.stateId, {
      sessionId,
      appKey,
      state: nextState,
      updatedAt: Date.now(),
    });
    this.latestBySessionApp.set(this.latestKey(sessionId, appKey), nextState.stateId);
    return nextState;
  }

  /**
   * Returns a stored state.
   *
   * @param stateId When given, that exact state is returned, provided it belongs to
   *   `sessionId`. When omitted, the latest state saved for `sessionId` and `app` is returned.
   * @returns The state, or `null` when it is unknown, expired, or owned by another session.
   */
  getState(sessionId: string, app: string, stateId?: string): SemanticAppState | null {
    this.expire();
    if (stateId) {
      const entry = this.states.get(stateId);
      return entry && entry.sessionId === sessionId ? entry.state : null;
    }
    const latestStateId = this.latestBySessionApp.get(this.latestKey(sessionId, normalizeAppKey(app)));
    if (!latestStateId) {
      return null;
    }
    return this.states.get(latestStateId)?.state ?? null;
  }

  /** Returns the element with `elementIndex` from the state selected as in `getState`. */
  getElement(sessionId: string, app: string, elementIndex: string, stateId?: string): SemanticElement | null {
    const state = this.getState(sessionId, app, stateId);
    return state?.elements.find((element) => element.index === elementIndex) ?? null;
  }

  /** Removes every state that belongs to `sessionId`. */
  clearSession(sessionId: string): void {
    for (const [stateId, entry] of this.states.entries()) {
      if (entry.sessionId === sessionId) {
        this.remove(stateId, entry);
      }
    }
  }

  /**
   * Removes every state older than the TTL.
   *
   * @param now Reference time in milliseconds; defaults to the current time.
   */
  expire(now = Date.now()): void {
    for (const [stateId, entry] of this.states.entries()) {
      if (now - entry.updatedAt > DEFAULT_STATE_TTL_MS) {
        this.remove(stateId, entry);
      }
    }
  }

  /**
   * Deletes one state. The latest-pointer of its (session, app) pair is only cleared when it
   * points at this very state; clearing it unconditionally would orphan a newer, still valid
   * state whenever an older one expires.
   */
  private remove(stateId: string, entry: StoredState): void {
    this.states.delete(stateId);
    const key = this.latestKey(entry.sessionId, entry.appKey);
    if (this.latestBySessionApp.get(key) === stateId) {
      this.latestBySessionApp.delete(key);
    }
  }

  private latestKey(sessionId: string, appKey: string): string {
    return `${sessionId}:${appKey}`;
  }
}

/** Process-wide store instance. */
export const semanticSessionStore = new SemanticSessionStore();
|
||||
@@ -0,0 +1,17 @@
|
||||
/** Maps the name of each semantic computer-use MCP tool to the semantic operation it runs. */
export const semanticMcpToolMap: Record<string, string> = {
  computer_app_drag: 'drag',
  computer_click_element: 'click',
  computer_get_app_state: 'get_app_state',
  computer_list_apps: 'list_apps',
  computer_perform_secondary_action: 'perform_secondary_action',
  computer_press_key: 'press_key',
  computer_scroll_element: 'scroll',
  computer_set_value: 'set_value',
  computer_type_text: 'type_text',
};

/** Every operation name that appears in `semanticMcpToolMap`. */
export const semanticOperationNames = new Set(Object.values(semanticMcpToolMap));

/**
 * Looks up the semantic operation behind an MCP tool name.
 *
 * @param toolName MCP tool name, e.g. `computer_click_element`.
 * @returns The operation name, or `null` when the tool is not a semantic tool.
 */
export function semanticOperationForMcpTool(toolName: string): string | null {
  // Own-property check: a plain index lookup would also return inherited members such as
  // `constructor` or `toString` for tool names that happen to match them.
  if (!Object.prototype.hasOwnProperty.call(semanticMcpToolMap, toolName)) {
    return null;
  }
  return semanticMcpToolMap[toolName] || null;
}
|
||||
58
server/modules/computer-use/semantics/semantic-types.ts
Normal file
58
server/modules/computer-use/semantics/semantic-types.ts
Normal file
@@ -0,0 +1,58 @@
|
||||
import type { DisplaySize, Point } from '@/modules/computer-use/computer-executor.js';
|
||||
|
||||
/** Rectangle described by its origin and size. */
export type SemanticBounds = {
  x: number;
  y: number;
  width: number;
  height: number;
};

/** An application as reported by app discovery. */
export type SemanticApp = {
  id?: string;
  name: string;
  bundleIdentifier?: string;
  processName?: string;
  pid?: number;
  running: boolean;
  windowTitle?: string;
};

/** One element of an application's accessibility tree. */
export type SemanticElement = {
  /** Identifier of the element within its state; used to address it in follow-up actions. */
  index: string;
  role: string;
  title?: string;
  value?: string;
  description?: string;
  enabled?: boolean;
  focused?: boolean;
  selected?: boolean;
  bounds?: SemanticBounds;
  actions?: string[];
  settableValue?: boolean;
};

/** Snapshot of an application: screenshot, display size and accessibility elements. */
export type SemanticAppState = {
  /** Identifier under which this snapshot can be referenced later. */
  stateId: string;
  app: string;
  platform: NodeJS.Platform;
  screenshotDataUrl: string | null;
  displaySize: DisplaySize | null;
  elements: SemanticElement[];
  accessibilityTree: SemanticElement[];
  treeText?: string;
  message?: string;
};

/**
 * Input of a semantic tool call: arbitrary tool-specific fields plus the common, optional
 * addressing fields.
 */
export type SemanticToolInput = Record<string, unknown> & {
  sessionId?: string;
  app?: string;
  stateId?: string;
  element_index?: string;
};

/** Result of a semantic tool call: an app state, or a list of apps for app discovery. */
export type SemanticToolResult =
  | SemanticAppState
  | {
      apps: SemanticApp[];
      platform: NodeJS.Platform;
    };

/** Point targeted by a semantic action. */
export type SemanticActionPoint = Point;
|
||||
Reference in New Issue
Block a user