diff --git a/apps/operator/package.json b/apps/operator/package.json index c3cb05b..add0c15 100644 --- a/apps/operator/package.json +++ b/apps/operator/package.json @@ -19,6 +19,7 @@ "@repo/logger": "workspace:*", "drizzle-orm": "0.45.2", "hono": "4.12.9", + "node-html-markdown": "2.0.0", "openai": "6.33.0", "zod": "4.3.6" }, diff --git a/apps/operator/src/modules/telegram/controller.ts b/apps/operator/src/modules/telegram/controller.ts index ef51ec7..8665ebe 100644 --- a/apps/operator/src/modules/telegram/controller.ts +++ b/apps/operator/src/modules/telegram/controller.ts @@ -14,6 +14,7 @@ import { TelegramService } from "../../services/telegram"; import type { AppEnv } from "../../types/env"; import type { TelegramUpdate } from "../../types/telegram"; import { splitMessage } from "../../utils/message"; +import { validateSourceUrl } from "../../utils/url-validator"; const DAYS = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]; @@ -200,9 +201,11 @@ const handleWebhook = async (c: Context) => { return { error: validation.error.message }; } - // Monitor support (source_url) is not yet implemented — reject if (input.sourceUrl) { - return { error: "URL monitors are not yet supported" }; + const urlCheck = validateSourceUrl(input.sourceUrl); + if (!urlCheck.valid) { + return { error: urlCheck.reason }; + } } // Store as pending in D1 — require user confirmation diff --git a/apps/operator/src/scheduled.ts b/apps/operator/src/scheduled.ts index ebea7c3..8c4e38a 100644 --- a/apps/operator/src/scheduled.ts +++ b/apps/operator/src/scheduled.ts @@ -2,9 +2,11 @@ import { Logger } from "@repo/logger"; import { OpenAiService } from "./services/openai"; import { ScheduleService } from "./services/schedule"; +import { scrapeUrl } from "./services/scrape"; import { TelegramService } from "./services/telegram"; import type { AppEnv } from "./types/env"; import { splitMessage } from "./utils/message"; +import { validateSourceUrl } from "./utils/url-validator"; type Env = 
AppEnv["Bindings"]; @@ -41,11 +43,73 @@ const handleScheduled = async ( const results = await Promise.allSettled( claimed.map(async (schedule) => { - // Monitor path (Stage 2 — stub) + // Monitor path if (schedule.sourceUrl) { - logger.info("monitor execution not yet implemented, skipping", { - scheduleId: schedule.id, + const urlCheck = validateSourceUrl(schedule.sourceUrl); + if (!urlCheck.valid) { + logger.error("monitor URL validation failed at execution", { + scheduleId: schedule.id, + reason: urlCheck.reason, + }); + return; + } + + const scrapeResult = await scrapeUrl(schedule.sourceUrl); + if (!scrapeResult.ok) { + throw new Error(`Scrape failed: ${scrapeResult.error}`); + } + + let previousState: string | null = null; + if (schedule.stateJson) { + try { + const parsed = JSON.parse(schedule.stateJson) as Record< + string, + unknown + >; + previousState = + typeof parsed.lastContent === "string" + ? parsed.lastContent + : null; + } catch { + logger.warn("malformed stateJson, treating as first run", { + scheduleId: schedule.id, + }); + } + } + + let scrapedContent = scrapeResult.text; + if (scrapeResult.truncated) { + scrapedContent += + "\n\n[Note: Page content was truncated. Analysis is based on partial content.]"; + } + + const openai = new OpenAiService(env.OPENAI_API_KEY, logger); + const analysis = await openai.analyzeMonitor({ + task: schedule.messagePrompt ?? 
schedule.description, + scrapedContent, + previousState, }); + + if (analysis.notify) { + for (const chunk of splitMessage(analysis.message)) { + await telegram.sendMessage({ chat_id: chatId, text: chunk }); + } + logger.info("monitor notification sent", { + scheduleId: schedule.id, + }); + } else { + logger.info("monitor check — no notification needed", { + scheduleId: schedule.id, + }); + } + + await scheduleService.updateState( + schedule.id, + JSON.stringify({ + lastContent: analysis.newState, + lastScrapedAt: now.toISOString(), + }) + ); return; } diff --git a/apps/operator/src/services/openai.test.ts b/apps/operator/src/services/openai.test.ts index edf6ce8..85d6556 100644 --- a/apps/operator/src/services/openai.test.ts +++ b/apps/operator/src/services/openai.test.ts @@ -280,4 +280,144 @@ describe("OpenAiService", () => { expect(executor).not.toHaveBeenCalled(); }); }); + + describe("analyzeMonitor", () => { + it("returns parsed monitor analysis", async () => { + const analysis = { + notify: true, + message: "Seinfeld is on TV at 20:00 on MTV3", + newState: "Current listings include Seinfeld at 20:00", + }; + createMock.mockResolvedValueOnce({ + choices: [{ message: { content: JSON.stringify(analysis) } }], + }); + + const result = await service.analyzeMonitor({ + task: "Check if Seinfeld is on today", + scrapedContent: "# TV Listings\n- 20:00 Seinfeld (MTV3)", + previousState: null, + }); + + expect(result).toEqual(analysis); + }); + + it("sends structured request with json_object format", async () => { + createMock.mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + notify: false, + message: "", + newState: "no changes", + }), + }, + }, + ], + }); + + await service.analyzeMonitor({ + task: "Check for changes", + scrapedContent: "content", + previousState: "old state", + }); + + expect(createMock).toHaveBeenCalledWith( + expect.objectContaining({ + response_format: { type: "json_object" }, + max_completion_tokens: 4096, + }) + 
); + }); + + it("includes previous state in user message", async () => { + createMock.mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + notify: true, + message: "Changed", + newState: "new", + }), + }, + }, + ], + }); + + await service.analyzeMonitor({ + task: "Check diff", + scrapedContent: "new content", + previousState: "old content summary", + }); + + const callArgs = createMock.mock.calls[0] as [ + { messages: { content: string }[] }, + ]; + const userMsg = callArgs[0].messages[1].content; + expect(userMsg).toContain("old content summary"); + }); + + it("uses placeholder when no previous state", async () => { + createMock.mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + notify: true, + message: "First check", + newState: "initial", + }), + }, + }, + ], + }); + + await service.analyzeMonitor({ + task: "Monitor page", + scrapedContent: "content", + previousState: null, + }); + + const callArgs = createMock.mock.calls[0] as [ + { messages: { content: string }[] }, + ]; + const userMsg = callArgs[0].messages[1].content; + expect(userMsg).toContain("First check"); + }); + + it("throws on empty response", async () => { + createMock.mockResolvedValueOnce({ + choices: [{ message: { content: null } }], + }); + + await expect( + service.analyzeMonitor({ + task: "test", + scrapedContent: "content", + previousState: null, + }) + ).rejects.toThrow("OpenAI returned empty response for monitor analysis"); + }); + + it("throws on invalid JSON structure", async () => { + createMock.mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ wrong: "structure" }), + }, + }, + ], + }); + + await expect( + service.analyzeMonitor({ + task: "test", + scrapedContent: "content", + previousState: null, + }) + ).rejects.toThrow(); + }); + }); }); diff --git a/apps/operator/src/services/openai.ts b/apps/operator/src/services/openai.ts index 759453d..eef2ef0 100644 --- 
a/apps/operator/src/services/openai.ts +++ b/apps/operator/src/services/openai.ts @@ -1,6 +1,7 @@ import type { Logger } from "@repo/logger"; import OpenAI from "openai"; import type { ChatCompletionTool } from "openai/resources/chat/completions"; +import { z } from "zod"; const SYSTEM_PROMPT = `You are a helpful personal assistant called Switch Operator. Be concise and helpful. @@ -16,6 +17,14 @@ Schedule types: Use fixed_message for exact text or message_prompt for AI-generated content. +You can also create web monitors that scrape a URL on a schedule and notify based on conditions. +When the user wants to monitor a website for changes or check for specific content, use create_schedule with source_url + message_prompt. +The message_prompt should describe what to look for or how to analyze the page content. + +Monitor examples: +- "Notify me when a specific show is on TV" → source_url with the TV listings page, message_prompt: "Check if [show name] appears in today's listings. Notify with channel and time if found." +- "Weekly report changes" → source_url with the report page, message_prompt: "Compare this week's content to last week. Summarize key changes." + When listing schedules, format them as a numbered list (1, 2, 3...) with key details like description, type, time, and next run. When the user asks to delete a schedule by number, first call list_schedules to get the current list, then use the ID from the matching position to call delete_schedule.`; @@ -64,6 +73,11 @@ const SCHEDULE_TOOLS: ChatCompletionTool[] = [ description: "Prompt for AI-generated message. Mutually exclusive with fixed_message.", }, + source_url: { + type: "string", + description: + "URL to monitor/scrape. When set, the schedule becomes a monitor: it will fetch this URL on each run, analyze the content using message_prompt, and notify only if the condition is met. Requires message_prompt. 
Cannot be used with fixed_message.", + }, description: { type: "string", description: "Short description of this schedule (max 200 chars).", @@ -216,7 +230,73 @@ class OpenAiService { throw new Error("Tool calling exceeded maximum iterations"); } + + async analyzeMonitor(params: { + task: string; + scrapedContent: string; + previousState: string | null; + }): Promise { + this.logger.debug("analyzing monitor", { + taskLength: params.task.length, + contentLength: params.scrapedContent.length, + hasPreviousState: params.previousState != null, + }); + + const previousStateText = + params.previousState ?? "First check — no previous state."; + + const response = await this.client.chat.completions.create({ + model: "gpt-5.4-mini", + max_completion_tokens: 4096, + response_format: { type: "json_object" }, + messages: [ + { role: "system", content: MONITOR_ANALYSIS_PROMPT }, + { + role: "user", + content: `## Task\n${params.task}\n\n## Current page content\n${params.scrapedContent}\n\n## Previous state\n${previousStateText}`, + }, + ], + }); + + const content = response.choices[0]?.message.content; + if (!content) { + throw new Error("OpenAI returned empty response for monitor analysis"); + } + + const parsed: unknown = JSON.parse(content); + return monitorAnalysisSchema.parse(parsed); + } } +const MONITOR_ANALYSIS_PROMPT = `You are analyzing a web page for a monitoring task. + +The user will provide: +1. A task describing what to check or monitor +2. The current page content (scraped and converted to markdown) +3. 
Previous state from the last check (or "First check" if this is the first run) + +Respond in JSON with exactly these fields: +{ + "notify": true or false, + "message": "notification message to send to the user (max 4000 chars, use markdown formatting)", + "newState": "concise summary of current state for comparison next time (max 5000 chars)" +} + +Rules: +- Only set "notify" to true if the condition described in the task is met +- For diff/change detection tasks: compare current content to previous state and summarize what changed. Notify if there are meaningful changes. +- For condition check tasks: evaluate whether the specific condition is satisfied. Notify only if it is. +- The "message" should be informative and actionable — include relevant details from the page +- The "newState" should contain enough information to compare against next time. Keep it concise. +- If this is the first check, always set notify to true with a summary of current state`; + +const monitorAnalysisSchema = z.object({ + notify: z.boolean(), + message: z.string().max(4000), + newState: z.string().max(5000), +}); + +type MonitorAnalysis = z.infer; + export { MAX_TOOL_ITERATIONS, OpenAiService, SCHEDULE_TOOLS }; -export type { ToolExecutor, ToolResult }; +export type { MonitorAnalysis, ToolExecutor, ToolResult }; diff --git a/apps/operator/src/services/scrape.test.ts b/apps/operator/src/services/scrape.test.ts new file mode 100644 index 0000000..b121c64 --- /dev/null +++ b/apps/operator/src/services/scrape.test.ts @@ -0,0 +1,351 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; + +import { collapseWhitespace, convertContent, scrapeUrl } from "./scrape"; + +const createMockResponse = ( + body: string, + init?: { status?: number; headers?: Record } +): Response => + new Response(body, { + status: init?.status ?? 200, + headers: init?.headers ?? 
{ "content-type": "text/html" }, + }); + +describe("scrapeUrl", () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it("converts HTML response to markdown", async () => { + vi.stubGlobal( + "fetch", + vi + .fn() + .mockResolvedValueOnce(createMockResponse("

<h1>Hello</h1><p>World</p>

")) + ); + + const result = await scrapeUrl("https://example.com"); + + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.text).toContain("Hello"); + expect(result.text).toContain("World"); + expect(result.truncated).toBe(false); + } + }); + + it("handles JSON response with pretty-print", async () => { + const json = JSON.stringify({ key: "value", nested: { a: 1 } }); + vi.stubGlobal( + "fetch", + vi.fn().mockResolvedValueOnce( + createMockResponse(json, { + headers: { "content-type": "application/json" }, + }) + ) + ); + + const result = await scrapeUrl("https://api.example.com/data"); + + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.text).toContain("```json"); + expect(result.text).toContain('"key": "value"'); + expect(result.text).toContain("```"); + } + }); + + it("handles plain text response", async () => { + vi.stubGlobal( + "fetch", + vi.fn().mockResolvedValueOnce( + createMockResponse("Plain text content", { + headers: { "content-type": "text/plain" }, + }) + ) + ); + + const result = await scrapeUrl("https://example.com/robots.txt"); + + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.text).toBe("Plain text content"); + } + }); + + it("returns error for unsupported content type", async () => { + vi.stubGlobal( + "fetch", + vi.fn().mockResolvedValueOnce( + createMockResponse("binary", { + headers: { "content-type": "application/octet-stream" }, + }) + ) + ); + + const result = await scrapeUrl("https://example.com/file.bin"); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toContain("Unsupported content type"); + } + }); + + it("returns error for non-ok HTTP status", async () => { + vi.stubGlobal( + "fetch", + vi + .fn() + .mockResolvedValueOnce(createMockResponse("Forbidden", { status: 403 })) + ); + + const result = await scrapeUrl("https://example.com"); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toBe("Blocked by site"); + 
expect(result.statusCode).toBe(403); + } + }); + + it("returns error for 404", async () => { + vi.stubGlobal( + "fetch", + vi + .fn() + .mockResolvedValueOnce(createMockResponse("Not found", { status: 404 })) + ); + + const result = await scrapeUrl("https://example.com/missing"); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toBe("Page not found"); + expect(result.statusCode).toBe(404); + } + }); + + it("returns generic error for unmapped status codes", async () => { + vi.stubGlobal( + "fetch", + vi + .fn() + .mockResolvedValueOnce(createMockResponse("Error", { status: 502 })) + ); + + const result = await scrapeUrl("https://example.com"); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toBe("HTTP 502"); + } + }); + + it("truncates text exceeding maxTextLength", async () => { + const longHtml = `

<html><body><p>${"a".repeat(1000)}</p></body></html>

`; + vi.stubGlobal( + "fetch", + vi.fn().mockResolvedValueOnce(createMockResponse(longHtml)) + ); + + const result = await scrapeUrl("https://example.com", 100); + + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.text.length).toBeLessThanOrEqual(100); + expect(result.truncated).toBe(true); + } + }); + + it("returns error on fetch failure", async () => { + vi.stubGlobal( + "fetch", + vi.fn().mockRejectedValueOnce(new Error("Network error")) + ); + + const result = await scrapeUrl("https://example.com"); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toBe("Network error"); + } + }); + + it("returns error on timeout", async () => { + const timeoutError = new DOMException("Timeout", "TimeoutError"); + vi.stubGlobal("fetch", vi.fn().mockRejectedValueOnce(timeoutError)); + + const result = await scrapeUrl("https://example.com"); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toBe("Request timed out"); + } + }); + + it("follows safe redirects", async () => { + const mockFetch = vi + .fn() + .mockResolvedValueOnce( + new Response(null, { + status: 302, + headers: { location: "https://example.com/final" }, + }) + ) + .mockResolvedValueOnce(createMockResponse("

<p>Redirected</p>

")); + vi.stubGlobal("fetch", mockFetch); + + const result = await scrapeUrl("https://example.com/start"); + + expect(result.ok).toBe(true); + if (result.ok) { + expect(result.text).toContain("Redirected"); + } + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("rejects redirects to localhost", async () => { + vi.stubGlobal( + "fetch", + vi.fn().mockResolvedValueOnce( + new Response(null, { + status: 302, + headers: { location: "https://127.0.0.1/secret" }, + }) + ) + ); + + const result = await scrapeUrl("https://example.com"); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toContain("Redirect to unsafe URL"); + } + }); + + it("rejects redirects to HTTP", async () => { + vi.stubGlobal( + "fetch", + vi.fn().mockResolvedValueOnce( + new Response(null, { + status: 301, + headers: { location: "http://example.com/page" }, + }) + ) + ); + + const result = await scrapeUrl("https://example.com"); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toContain("Redirect to unsafe URL"); + } + }); + + it("returns error on too many redirects", async () => { + const redirectResponse = new Response(null, { + status: 302, + headers: { location: "https://example.com/loop" }, + }); + vi.stubGlobal("fetch", vi.fn().mockResolvedValue(redirectResponse)); + + const result = await scrapeUrl("https://example.com"); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toBe("Too many redirects"); + } + }); + + it("returns error when response body exceeds 2MB", async () => { + const largeBody = "x".repeat(3 * 1024 * 1024); + vi.stubGlobal( + "fetch", + vi.fn().mockResolvedValueOnce(createMockResponse(largeBody)) + ); + + const result = await scrapeUrl("https://example.com"); + + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error).toBe("Response exceeds 2MB size limit"); + } + }); +}); + +describe("convertContent", () => { + it("converts HTML to markdown", () => { + const result = 
convertContent( + "

<h1>Title</h1><p>Paragraph</p>

", + "text/html" + ); + expect("text" in result).toBe(true); + if ("text" in result) { + expect(result.text).toContain("Title"); + expect(result.text).toContain("Paragraph"); + } + }); + + it("handles application/xhtml+xml as HTML", () => { + const result = convertContent("

<p>Content</p>

", "application/xhtml+xml"); + expect("text" in result).toBe(true); + if ("text" in result) { + expect(result.text).toContain("Content"); + } + }); + + it("pretty-prints JSON", () => { + const result = convertContent('{"a":1}', "application/json"); + expect("text" in result).toBe(true); + if ("text" in result) { + expect(result.text).toContain("```json"); + expect(result.text).toContain('"a": 1'); + } + }); + + it("handles +json content types", () => { + const result = convertContent( + '{"data":"test"}', + "application/vnd.api+json" + ); + expect("text" in result).toBe(true); + if ("text" in result) { + expect(result.text).toContain("```json"); + } + }); + + it("wraps invalid JSON in plain code block", () => { + const result = convertContent("not json", "application/json"); + expect("text" in result).toBe(true); + if ("text" in result) { + expect(result.text).toContain("```\nnot json\n```"); + } + }); + + it("passes plain text through", () => { + const result = convertContent("hello world", "text/plain"); + expect("text" in result).toBe(true); + if ("text" in result) { + expect(result.text).toBe("hello world"); + } + }); + + it("returns error for unsupported types", () => { + const result = convertContent("data", "application/octet-stream"); + expect("error" in result).toBe(true); + }); +}); + +describe("collapseWhitespace", () => { + it("collapses 3+ newlines to double", () => { + expect(collapseWhitespace("a\n\n\n\nb")).toBe("a\n\nb"); + }); + + it("collapses multiple spaces to single", () => { + expect(collapseWhitespace("a b")).toBe("a b"); + }); + + it("trims leading and trailing whitespace", () => { + expect(collapseWhitespace(" hello ")).toBe("hello"); + }); +}); diff --git a/apps/operator/src/services/scrape.ts b/apps/operator/src/services/scrape.ts new file mode 100644 index 0000000..b033c42 --- /dev/null +++ b/apps/operator/src/services/scrape.ts @@ -0,0 +1,205 @@ +import { NodeHtmlMarkdown } from "node-html-markdown"; + +import { validateSourceUrl } 
from "../utils/url-validator"; + +type ScrapeOk = { ok: true; text: string; truncated: boolean }; +type ScrapeError = { ok: false; error: string; statusCode?: number }; +type ScrapeResult = ScrapeOk | ScrapeError; + +const MAX_BODY_BYTES = 2 * 1024 * 1024; // 2 MB +const DEFAULT_MAX_TEXT_LENGTH = 80_000; +const FETCH_TIMEOUT_MS = 15_000; +const MAX_REDIRECTS = 5; + +const USER_AGENT = "SwitchOperator/1.0"; + +const ERROR_MAP: Partial> = { + 403: "Blocked by site", + 404: "Page not found", + 429: "Rate limited by site", +}; + +const isHtml = (contentType: string): boolean => + contentType.includes("text/html") || + contentType.includes("application/xhtml+xml"); + +const isJson = (contentType: string): boolean => + contentType.includes("application/json") || contentType.includes("+json"); + +const isPlainText = (contentType: string): boolean => + contentType.includes("text/plain"); + +const concatChunks = (chunks: Uint8Array[], totalBytes: number): Uint8Array => { + const result = new Uint8Array(totalBytes); + let offset = 0; + for (const chunk of chunks) { + result.set(chunk, offset); + offset += chunk.byteLength; + } + return result; +}; + +const readBodyWithLimit = async ( + body: ReadableStream +): Promise<{ bytes: Uint8Array } | { error: string }> => { + const reader = body.getReader(); + const chunks: Uint8Array[] = []; + let totalBytes = 0; + + try { + for (;;) { + const { done, value } = await reader.read(); + if (done) { + break; + } + totalBytes += value.byteLength; + if (totalBytes > MAX_BODY_BYTES) { + await reader.cancel(); + return { error: "Response exceeds 2MB size limit" }; + } + chunks.push(value); + } + } finally { + reader.releaseLock(); + } + + return { bytes: concatChunks(chunks, totalBytes) }; +}; + +const collapseWhitespace = (text: string): string => + text + .replace(/\n{3,}/g, "\n\n") + .replace(/[ \t]{2,}/g, " ") + .trim(); + +const convertContent = ( + raw: string, + rawContentType: string +): { text: string } | { error: string } => { + 
const contentType = rawContentType.toLowerCase(); + if (isHtml(contentType)) { + const markdown = NodeHtmlMarkdown.translate(raw); + return { text: collapseWhitespace(markdown) }; + } + + if (isJson(contentType)) { + try { + const parsed: unknown = JSON.parse(raw); + const pretty = JSON.stringify(parsed, null, 2); + return { text: `\`\`\`json\n${pretty}\n\`\`\`` }; + } catch { + return { text: `\`\`\`\n${raw}\n\`\`\`` }; + } + } + + if (isPlainText(contentType)) { + return { text: collapseWhitespace(raw) }; + } + + return { error: `Unsupported content type: ${contentType}` }; +}; + +const isRedirect = (status: number): boolean => + status === 301 || + status === 302 || + status === 303 || + status === 307 || + status === 308; + +const fetchWithSafeRedirects = async ( + initialUrl: string +): Promise => { + let currentUrl = initialUrl; + + for (let i = 0; i <= MAX_REDIRECTS; i++) { + const response = await fetch(currentUrl, { + headers: { "User-Agent": USER_AGENT }, + signal: AbortSignal.timeout(FETCH_TIMEOUT_MS), + redirect: "manual", + }); + + if (!isRedirect(response.status)) { + return response; + } + + const location = response.headers.get("location"); + if (!location) { + return response; + } + + const resolved = new URL(location, currentUrl).toString(); + const check = validateSourceUrl(resolved); + if (!check.valid) { + throw new Error(`Redirect to unsafe URL: ${check.reason}`); + } + + currentUrl = resolved; + } + + throw new Error("Too many redirects"); +}; + +const scrapeUrl = async ( + url: string, + maxTextLength = DEFAULT_MAX_TEXT_LENGTH +): Promise => { + let response: Response; + try { + response = await fetchWithSafeRedirects(url); + } catch (error) { + if (error instanceof DOMException && error.name === "TimeoutError") { + return { ok: false, error: "Request timed out" }; + } + return { + ok: false, + error: error instanceof Error ? 
error.message : "Fetch failed", + }; + } + + if (!response.ok) { + const mapped = ERROR_MAP[response.status]; + return { + ok: false, + error: mapped ?? `HTTP ${String(response.status)}`, + statusCode: response.status, + }; + } + + if (!response.body) { + return { ok: false, error: "Empty response body" }; + } + + const bodyResult = await readBodyWithLimit( + response.body as ReadableStream + ); + if ("error" in bodyResult) { + return { ok: false, error: bodyResult.error }; + } + + const raw = new TextDecoder().decode(bodyResult.bytes); + const contentType = response.headers.get("content-type") ?? ""; + + const converted = convertContent(raw, contentType); + if ("error" in converted) { + return { ok: false, error: converted.error }; + } + + const truncated = converted.text.length > maxTextLength; + const text = truncated + ? converted.text.slice(0, maxTextLength) + : converted.text; + + return { ok: true, text, truncated }; +}; + +export { + collapseWhitespace, + convertContent, + DEFAULT_MAX_TEXT_LENGTH, + FETCH_TIMEOUT_MS, + MAX_BODY_BYTES, + MAX_REDIRECTS, + scrapeUrl, + USER_AGENT, +}; +export type { ScrapeResult }; diff --git a/apps/operator/src/utils/url-validator.test.ts b/apps/operator/src/utils/url-validator.test.ts index 0d8a9f8..a8eb001 100644 --- a/apps/operator/src/utils/url-validator.test.ts +++ b/apps/operator/src/utils/url-validator.test.ts @@ -1,62 +1,50 @@ import { describe, expect, it } from "vitest"; -import { parseAllowedDomains, validateSourceUrl } from "./url-validator"; - -const DOMAINS = ["telsu.fi", "blackrock.com"]; +import { validateSourceUrl } from "./url-validator"; describe("validateSourceUrl", () => { - it("accepts valid HTTPS URL on allowed domain", () => { - expect(validateSourceUrl("https://www.telsu.fi/", DOMAINS)).toEqual({ + it("accepts valid HTTPS URL", () => { + expect(validateSourceUrl("https://www.example.com/")).toEqual({ valid: true, }); }); - it("accepts subdomain of allowed domain", () => { + it("accepts HTTPS URL with 
path", () => { expect( - validateSourceUrl( - "https://www.blackrock.com/us/individual/insights", - DOMAINS - ) + validateSourceUrl("https://www.example.org/us/individual/insights") ).toEqual({ valid: true }); }); - it("accepts exact domain match", () => { - expect(validateSourceUrl("https://telsu.fi/page", DOMAINS)).toEqual({ - valid: true, - }); - }); - it("rejects HTTP URLs", () => { - const result = validateSourceUrl("http://www.telsu.fi/", DOMAINS); + const result = validateSourceUrl("http://www.example.com/"); expect(result).toEqual({ valid: false, reason: "Only HTTPS URLs are allowed", }); }); - it("rejects non-allowlisted domains", () => { - const result = validateSourceUrl("https://evil.com/", DOMAINS); - expect(result.valid).toBe(false); - expect(result).toHaveProperty("reason"); - }); - it("rejects localhost", () => { - const result = validateSourceUrl("https://localhost/", DOMAINS); + const result = validateSourceUrl("https://localhost/"); expect(result.valid).toBe(false); }); it("rejects 127.0.0.1", () => { - const result = validateSourceUrl("https://127.0.0.1/", DOMAINS); + const result = validateSourceUrl("https://127.0.0.1/"); expect(result.valid).toBe(false); }); it("rejects [::1]", () => { - const result = validateSourceUrl("https://[::1]/", DOMAINS); + const result = validateSourceUrl("https://[::1]/"); + expect(result.valid).toBe(false); + }); + + it("rejects 0.0.0.0", () => { + const result = validateSourceUrl("https://0.0.0.0/"); expect(result.valid).toBe(false); }); it("rejects URLs with credentials", () => { - const result = validateSourceUrl("https://user:pass@telsu.fi/", DOMAINS); + const result = validateSourceUrl("https://user:pass@example.com/"); expect(result).toEqual({ valid: false, reason: "URLs with credentials are not allowed", @@ -64,10 +52,7 @@ describe("validateSourceUrl", () => { }); it("rejects URLs over 2048 chars", () => { - const result = validateSourceUrl( - "https://telsu.fi/" + "a".repeat(2048), - DOMAINS - ); + const 
result = validateSourceUrl("https://example.com/" + "a".repeat(2048)); expect(result).toEqual({ valid: false, reason: "URL exceeds 2048 character limit", @@ -75,12 +60,12 @@ describe("validateSourceUrl", () => { }); it("rejects invalid URLs", () => { - const result = validateSourceUrl("not-a-url", DOMAINS); + const result = validateSourceUrl("not-a-url"); expect(result).toEqual({ valid: false, reason: "Invalid URL" }); }); it("rejects FTP URLs", () => { - const result = validateSourceUrl("ftp://telsu.fi/", DOMAINS); + const result = validateSourceUrl("ftp://example.com/"); expect(result).toEqual({ valid: false, reason: "Only HTTPS URLs are allowed", @@ -88,41 +73,17 @@ describe("validateSourceUrl", () => { }); it("rejects file URLs", () => { - const result = validateSourceUrl("file:///etc/passwd", DOMAINS); + const result = validateSourceUrl("file:///etc/passwd"); expect(result.valid).toBe(false); }); - it("rejects when allowed domains list is empty", () => { - const result = validateSourceUrl("https://telsu.fi/", []); + it("rejects 127.0.0.2 (full loopback range)", () => { + const result = validateSourceUrl("https://127.0.0.2/"); expect(result.valid).toBe(false); }); - it("prevents domain suffix attacks (eviltelsu.fi)", () => { - const result = validateSourceUrl("https://eviltelsu.fi/", DOMAINS); + it("rejects 127.255.255.255 (loopback range upper bound)", () => { + const result = validateSourceUrl("https://127.255.255.255/"); expect(result.valid).toBe(false); }); }); - -describe("parseAllowedDomains", () => { - it("parses comma-separated domains", () => { - expect(parseAllowedDomains("telsu.fi,blackrock.com")).toEqual([ - "telsu.fi", - "blackrock.com", - ]); - }); - - it("trims whitespace", () => { - expect(parseAllowedDomains(" telsu.fi , blackrock.com ")).toEqual([ - "telsu.fi", - "blackrock.com", - ]); - }); - - it("returns empty array for undefined", () => { - expect(parseAllowedDomains(undefined)).toEqual([]); - }); - - it("returns empty array for empty 
string", () => { - expect(parseAllowedDomains("")).toEqual([]); - }); -}); diff --git a/apps/operator/src/utils/url-validator.ts b/apps/operator/src/utils/url-validator.ts index 6f19fde..f32f731 100644 --- a/apps/operator/src/utils/url-validator.ts +++ b/apps/operator/src/utils/url-validator.ts @@ -1,15 +1,15 @@ /** - * Validates a source URL against the SSRF safety policy: + * Validates a source URL against the safety policy: * - HTTPS only - * - Hostname must be in the allowed domains list + * - No localhost / loopback addresses + * - No credentials in URL * - Max 2048 characters * * DNS-level private IP checks are deferred to the fetch layer * (Cloudflare Workers already block fetches to private IPs). */ const validateSourceUrl = ( - url: string, - allowedDomains: string[] + url: string ): { valid: true } | { valid: false; reason: string } => { if (url.length > 2048) { return { valid: false, reason: "URL exceeds 2048 character limit" }; @@ -34,36 +34,15 @@ const validateSourceUrl = ( if ( hostname === "localhost" || - hostname === "127.0.0.1" || + hostname.startsWith("127.") || + hostname === "::1" || hostname === "[::1]" || hostname === "0.0.0.0" ) { return { valid: false, reason: "Localhost URLs are not allowed" }; } - const isAllowed = allowedDomains.some((domain) => { - const d = domain.toLowerCase(); - return hostname === d || hostname.endsWith(`.${d}`); - }); - - if (!isAllowed) { - return { - valid: false, - reason: `Domain "${hostname}" is not in the allowed list`, - }; - } - return { valid: true }; }; -const parseAllowedDomains = (envValue: string | undefined): string[] => { - if (!envValue) { - return []; - } - return envValue - .split(",") - .map((d) => d.trim()) - .filter((d) => d.length > 0); -}; - -export { parseAllowedDomains, validateSourceUrl }; +export { validateSourceUrl }; diff --git a/apps/operator/wrangler.jsonc b/apps/operator/wrangler.jsonc index 87760ea..931f8fe 100644 --- a/apps/operator/wrangler.jsonc +++ b/apps/operator/wrangler.jsonc 
@@ -6,7 +6,7 @@ "compatibility_flags": ["nodejs_compat"], "observability": { "logs": { - "enabled": false, + "enabled": true, "invocation_logs": true, }, }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 59580f1..c8ecd16 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -38,6 +38,9 @@ importers: hono: specifier: 4.12.9 version: 4.12.9 + node-html-markdown: + specifier: 2.0.0 + version: 2.0.0 openai: specifier: 6.33.0 version: 6.33.0(ws@8.18.0)(zod@4.3.6) @@ -1340,6 +1343,9 @@ packages: blake3-wasm@2.1.5: resolution: {integrity: sha512-F1+K8EbfOZE49dtoPtmxUQrpXaBIl3ICvasLh+nJta0xkz+9kF/7uet9fLnwKqhDrmj6g+6K3Tw9yQPUg2ka5g==} + boolbase@1.0.0: + resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} + brace-expansion@1.1.13: resolution: {integrity: sha512-9ZLprWS6EENmhEOpjCYW2c8VkmOvckIJZfkr7rBW6dObmfgJ/L1GpSYW5Hpo9lDz4D1+n0Ckz8rU7FwHDQiG/w==} @@ -1402,6 +1408,13 @@ packages: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} + css-select@5.2.2: + resolution: {integrity: sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==} + + css-what@6.2.2: + resolution: {integrity: sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==} + engines: {node: '>= 6'} + data-view-buffer@1.0.2: resolution: {integrity: sha512-EmKO5V3OLXh1rtK2wgXRansaK1/mtVdTUEiEI0W8RkvgT05kfxaH29PliLnpLP73yYO6142Q72QNa8Wx/A5CqQ==} engines: {node: '>= 0.4'} @@ -1450,6 +1463,19 @@ packages: resolution: {integrity: sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==} engines: {node: '>=0.10.0'} + dom-serializer@2.0.0: + resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==} + + domelementtype@2.3.0: + resolution: {integrity: 
sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==} + + domhandler@5.0.3: + resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==} + engines: {node: '>= 4'} + + domutils@3.2.2: + resolution: {integrity: sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==} + drizzle-kit@0.31.10: resolution: {integrity: sha512-7OZcmQUrdGI+DUNNsKBn1aW8qSoKuTH7d0mYgSP8bAzdFzKoovxEFnoGQp2dVs82EOJeYycqRtciopszwUf8bw==} hasBin: true @@ -1553,6 +1579,10 @@ packages: electron-to-chromium@1.5.328: resolution: {integrity: sha512-QNQ5l45DzYytThO21403XN3FvK0hOkWDG8viNf6jqS42msJ8I4tGDSpBCgvDRRPnkffafiwAym2X2eHeGD2V0w==} + entities@4.5.0: + resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} + engines: {node: '>=0.12'} + error-stack-parser-es@1.0.5: resolution: {integrity: sha512-5qucVt2XcuGMcEGgWI7i+yZpmpByQ8J1lHhcL7PwqCwu9FPP3VUXzT4ltHe5i2z9dePwEHcDVOAfSnHsOlCXRA==} @@ -1831,6 +1861,10 @@ packages: resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==} engines: {node: '>= 0.4'} + he@1.2.0: + resolution: {integrity: sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==} + hasBin: true + hermes-estree@0.25.1: resolution: {integrity: sha512-0wUoCcLp+5Ev5pDW2OriHC2MJCbwLwuRx+gAqMTOkGKJJiBCLjtrvy4PWUGn6MIVefecRpzoOZ/UV6iGdOr+Cw==} @@ -2137,9 +2171,19 @@ packages: resolution: {integrity: sha512-pyFS63ptit/P5WqUkt+UUfe+4oevH+bFeIiPPdfb0pFeYEu/1ELnJu5l+5EcTKYL5M7zaAa7S8ddywgXypqKCw==} engines: {node: '>= 0.4'} + node-html-markdown@2.0.0: + resolution: {integrity: sha512-DqUC3GGP7pwSYxS93SwHoP+qCw78xcMP6C6H2DuC8rPD2AweJRjBzQb5SdXpKtDlqAQ7hVotJcfhgU7hU5Gthw==} + engines: {node: '>=20.0.0'} + + node-html-parser@6.1.13: + resolution: {integrity: 
sha512-qIsTMOY4C/dAa5Q5vsobRpOOvPfC4pB61UVW2uSwZNUp0QU/jCekTal1vMmbO0DgdHeLUJpv/ARmDqErVxA3Sg==} + node-releases@2.0.36: resolution: {integrity: sha512-TdC8FSgHz8Mwtw9g5L4gR/Sh9XhSP/0DEkQxfEFXOpiul5IiHgHan2VhYYb6agDSfp4KuvltmGApc8HMgUrIkA==} + nth-check@2.1.1: + resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} + object-assign@4.1.1: resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} engines: {node: '>=0.10.0'} @@ -3630,6 +3674,8 @@ snapshots: blake3-wasm@2.1.5: {} + boolbase@1.0.0: {} + brace-expansion@1.1.13: dependencies: balanced-match: 1.0.2 @@ -3695,6 +3741,16 @@ snapshots: shebang-command: 2.0.0 which: 2.0.2 + css-select@5.2.2: + dependencies: + boolbase: 1.0.0 + css-what: 6.2.2 + domhandler: 5.0.3 + domutils: 3.2.2 + nth-check: 2.1.1 + + css-what@6.2.2: {} + data-view-buffer@1.0.2: dependencies: call-bound: 1.0.4 @@ -3741,6 +3797,24 @@ snapshots: dependencies: esutils: 2.0.3 + dom-serializer@2.0.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + entities: 4.5.0 + + domelementtype@2.3.0: {} + + domhandler@5.0.3: + dependencies: + domelementtype: 2.3.0 + + domutils@3.2.2: + dependencies: + dom-serializer: 2.0.0 + domelementtype: 2.3.0 + domhandler: 5.0.3 + drizzle-kit@0.31.10: dependencies: '@drizzle-team/brocli': 0.10.2 @@ -3760,6 +3834,8 @@ snapshots: electron-to-chromium@1.5.328: {} + entities@4.5.0: {} + error-stack-parser-es@1.0.5: {} es-abstract@1.24.1: @@ -4222,6 +4298,8 @@ snapshots: dependencies: function-bind: 1.1.2 + he@1.2.0: {} + hermes-estree@0.25.1: {} hermes-parser@0.25.1: @@ -4510,8 +4588,21 @@ snapshots: object.entries: 1.1.9 semver: 6.3.1 + node-html-markdown@2.0.0: + dependencies: + node-html-parser: 6.1.13 + + node-html-parser@6.1.13: + dependencies: + css-select: 5.2.2 + he: 1.2.0 + node-releases@2.0.36: {} + nth-check@2.1.1: + dependencies: + boolbase: 1.0.0 + object-assign@4.1.1: {} 
object-inspect@1.13.4: {}