Web Scraping on Modal

This example shows how you can scrape links from a website and post them to a Slack channel using Modal.

import os

import modal

app = modal.App("example-linkscraper")


playwright_image = modal.Image.debian_slim(
    python_version="3.10"
).run_commands(  # Doesn't work with 3.11 yet
    "apt-get update",
    "apt-get install -y software-properties-common",
    "apt-add-repository non-free",
    "apt-add-repository contrib",
    "pip install playwright==1.42.0",
    "playwright install-deps chromium",
    "playwright install chromium",
)


@app.function(image=playwright_image)
async def get_links(url: str) -> set[str]:
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url)
        links = await page.eval_on_selector_all(
            "a[href]", "elements => elements.map(element => element.href)"
        )
        await browser.close()

    return set(links)


slack_sdk_image = modal.Image.debian_slim(python_version="3.10").pip_install(
    "slack-sdk==3.27.1"
)


@app.function(
    image=slack_sdk_image,
    secrets=[modal.Secret.from_name("scraper-slack-secret")],
)
def bot_token_msg(channel, message):
    import slack_sdk
    from slack_sdk.http_retry.builtin_handlers import RateLimitErrorRetryHandler

    client = slack_sdk.WebClient(token=os.environ["SLACK_BOT_TOKEN"])
    rate_limit_handler = RateLimitErrorRetryHandler(max_retry_count=3)
    client.retry_handlers.append(rate_limit_handler)

    print(f"Posting {message} to #{channel}")
    client.chat_postMessage(channel=channel, text=message)


@app.function()
def scrape():
    links_of_interest = ["http://modal.com"]

    for links in get_links.map(links_of_interest):
        for link in links:
            bot_token_msg.remote("scraped-links", link)


@app.function(schedule=modal.Period(days=1))
def daily_scrape():
    scrape.remote()


@app.local_entrypoint()
def run():
    scrape.remote()