Example (webscraper.py)

This is the source code for 10_integrations.webscraper.

import os

import modal

app = modal.App(
    "example-linkscraper"
)  # Note: prior to April 2024, "app" was called "stub"


playwright_image = modal.Image.debian_slim(
    python_version="3.10"
).run_commands(  # Doesn't work with 3.11 yet
    "apt-get update",
    "apt-get install -y software-properties-common",
    "apt-add-repository non-free",
    "apt-add-repository contrib",
    "pip install playwright==1.42.0",
    "playwright install-deps chromium",
    "playwright install chromium",
)


@app.function(image=playwright_image)
async def get_links(url: str) -> set[str]:
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url)
        links = await page.eval_on_selector_all(
            "a[href]", "elements => elements.map(element => element.href)"
        )
        await browser.close()

    return set(links)


slack_sdk_image = modal.Image.debian_slim().pip_install("slack-sdk")


@app.function(
    image=slack_sdk_image,
    secrets=[modal.Secret.from_name("scraper-slack-secret")],
)
def bot_token_msg(channel, message):
    import slack_sdk

    print(f"Posting {message} to #{channel}")
    client = slack_sdk.WebClient(token=os.environ["SLACK_BOT_TOKEN"])
    client.chat_postMessage(channel=channel, text=message)


@app.function()
def scrape():
    links_of_interest = ["http://modal.com"]

    for links in get_links.map(links_of_interest):
        for link in links:
            bot_token_msg.remote("scraped-links", link)


@app.function(schedule=modal.Period(days=1))
def daily_scrape():
    scrape.remote()


@app.local_entrypoint()
def run():
    scrape.remote()