From autonomous research agents to production RAG pipelines — see how AI teams use ContextZip to cut token costs and ship faster.
from langchain.tools import Tool
import requests


def read_web_page(url: str) -> str:
    """Fetch clean Markdown from any URL via the ContextZip extract API.

    Raises:
        requests.HTTPError: on a non-2xx response — surfaced immediately
            instead of failing later with an opaque KeyError on the body.
    """
    r = requests.post(
        "https://contextzip.com/v1/extract",
        headers={"X-API-Key": CONTEXTZIP_KEY},
        json={"url": url, "mode": "clean"},
        timeout=30,
    )
    r.raise_for_status()  # fail fast on HTTP errors (original skipped this)
    data = r.json()
    return data["data"]["markdown"]


# Register as an agent tool
web_reader = Tool(
    name="read_url",
    func=read_web_page,
    description="Read and extract content from any web URL. "
                "Returns clean Markdown. Use for articles, docs, reports.",
)

# Add to your agent
agent = initialize_agent(
    tools=[web_reader, ...],
    llm=ChatOpenAI(model="gpt-4o"),
    agent=AgentType.OPENAI_FUNCTIONS,
)
# tools/read_url.yml — add to your OpenClaw agent
name: read_url
description: |
  Fetch and extract clean Markdown content from any web URL.
  Use when you need to read articles, documentation, news,
  or any web page to gather information.
endpoint: https://contextzip.com/v1/extract
method: POST
headers:
  X-API-Key: ${CONTEXTZIP_API_KEY}
  Content-Type: application/json
body_template:
  url: ${url}
  mode: "clean"
response_mapping:
  content: data.markdown
  title: data.title
  cached: cached
parameters:
  - name: url
    type: string
    description: "The full URL of the web page to read"
    required: true
    validation:
      pattern: "^https?://"
---
# .env — set your key
CONTEXTZIP_API_KEY=czk_your_key_here
import requests
from openai import OpenAI
from pinecone import Pinecone

client = OpenAI()
pc = Pinecone(api_key=PINECONE_KEY)
index = pc.Index("my-knowledge-base")


def ingest_url(url: str, doc_id: str):
    """Extract a page as clean Markdown, chunk it by heading, embed, upsert.

    Args:
        url: Page to ingest.
        doc_id: Prefix for the per-chunk vector IDs (``{doc_id}_{i}``).
    """
    # 1. Extract clean Markdown
    r = requests.post(
        "https://contextzip.com/v1/extract",
        headers={"X-API-Key": CONTEXTZIP_KEY},
        json={"url": url, "mode": "clean"},
        timeout=30,  # don't hang forever on a slow extraction
    )
    r.raise_for_status()  # fail fast instead of KeyError on an error body
    content = r.json()["data"]["markdown"]

    # 2. Chunk by headings (Markdown makes this trivial)
    chunks = [c.strip() for c in content.split("\n## ") if c]
    if not chunks:
        print(f"No content extracted from {url}")
        return

    # 3. Embed ALL chunks in one API call (batched: 1 request instead of N)
    resp = client.embeddings.create(
        input=chunks,
        model="text-embedding-3-small",
    )
    vectors = [
        {
            "id": f"{doc_id}_{i}",
            "values": item.embedding,
            "metadata": {"url": url, "chunk": chunk[:500]},
        }
        for i, (chunk, item) in enumerate(zip(chunks, resp.data))
    ]
    index.upsert(vectors=vectors)
    print(f"Ingested {len(vectors)} chunks from {url}")
import json

import requests
from openai import OpenAI

client = OpenAI()


def extract_price(product_url: str) -> dict:
    """Extract {name, price, currency, in_stock} from a product page.

    Returns:
        Parsed dict of the model's JSON answer.

    Raises:
        requests.HTTPError: if the extract API returns a non-2xx status.
    """
    # Get clean page content
    r = requests.post(
        "https://contextzip.com/v1/extract",
        headers={"X-API-Key": CONTEXTZIP_KEY},
        json={"url": product_url, "mode": "clean"},
        timeout=30,
    )
    r.raise_for_status()  # fail fast on HTTP errors
    page = r.json()["data"]["markdown"]

    # Let GPT extract structured data (JSON mode guarantees valid JSON)
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[{
            "role": "user",
            "content": f"Extract: name, price, currency, in_stock\n\n{page}",
        }],
    )
    # BUG FIX: the original returned the raw JSON *string*, contradicting
    # the declared `-> dict` return type. Parse it before returning.
    return json.loads(result.choices[0].message.content)


# Monitor 1000 product pages for ~$3 total
prices = [extract_price(url) for url in product_urls]
const sources = [
  "https://techcrunch.com/latest",
  "https://hnrss.org/frontpage",
  "https://news.ycombinator.com",
];

/**
 * Fetch every URL in parallel — all cached after the first run — and
 * combine the extracted pages into one Markdown digest.
 *
 * @param {string[]} urls - Pages to extract and summarize.
 * @returns {Promise<string>} The summarized digest.
 */
async function buildDigest(urls) {
  const pages = await Promise.all(
    urls.map(async (url) => {
      const res = await fetch("https://contextzip.com/v1/extract", {
        method: "POST",
        headers: {
          "X-API-Key": process.env.CONTEXTZIP_KEY,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({ url, mode: "summary" }), // $0.001/req
      });
      if (!res.ok) {
        // Fail loudly here instead of crashing later on `p.data.title`
        throw new Error(`Extract failed for ${url}: HTTP ${res.status}`);
      }
      return res.json();
    })
  );

  const combined = pages
    .map((p) => `### ${p.data.title}\n${p.data.markdown}`)
    .join("\n\n---\n\n");

  return summarizeWithLLM(combined); // Your summarization step
}
import hashlib

import requests
from openai import OpenAI

client = OpenAI()
DB = {}  # your actual DB here


def check_for_changes(competitor_url: str):
    """Return an LLM summary of what changed on the page, or None.

    Returns None when the page is served from cache (no content change)
    or on the first visit (nothing to diff against yet).
    """
    r = requests.post(
        "https://contextzip.com/v1/extract",
        headers={"X-API-Key": CONTEXTZIP_KEY},
        json={"url": competitor_url, "mode": "clean"},
        timeout=30,
    )
    r.raise_for_status()  # don't hash an HTTP error payload as "content"
    data = r.json()

    # Skip if cached (no content change)
    if data.get("cached"):
        return None

    content = data["data"]["markdown"]
    # 12 hex chars kept for compatibility with previously stored hashes;
    # NOTE(review): a truncated digest raises collision odds — consider
    # storing the full hexdigest when migrating the DB.
    content_hash = hashlib.sha256(content.encode()).hexdigest()[:12]

    previous = DB.get(competitor_url)
    DB[competitor_url] = {"hash": content_hash, "content": content}

    if previous and previous["hash"] != content_hash:
        return summarize_diff(previous["content"], content)


def summarize_diff(old: str, new: str) -> str:
    """Ask the LLM to describe the delta between two page snapshots."""
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": (
                "What changed between these two versions?\n\n"
                f"OLD:\n{old[:2000]}\n\nNEW:\n{new[:2000]}"
            ),
        }],
    )
    return resp.choices[0].message.content