# robots.txt for TapSay (https://tapsay.me)
# Updated: 2026-04-25
# AI/LLM context: https://tapsay.me/llms.txt, https://tapsay.me/llms-ctx.txt, https://tapsay.me/llms-full.txt

# Default — allow all crawlers, block internals
User-agent: *
Disallow: /api/
Disallow: /admin/
Disallow: /.git/
Disallow: /node_modules/
Disallow: /affiliate-link
Disallow: /*.json$  # "$" anchors the match at the end of the URL (RFC 9309 wildcard syntax)
Allow: /

# Note: query strings (?lang, ?q, ?utm_, ?paywall, ?source=pwa) are NOT
# blocked. Each page declares a rel=canonical pointing to its clean URL,
# so Google consolidates the variants. Blocking them via robots.txt would
# prevent Google from confirming the canonical and trap those URLs in
# "Duplicate without user-selected canonical" in GSC.
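# Illustrative example of the consolidation above (the /phrases path is
# hypothetical): a variant URL such as
#   https://tapsay.me/phrases?lang=es&utm_source=newsletter
# carries <link rel="canonical" href="https://tapsay.me/phrases"> in its
# <head>, so Google folds every query-string variant into the clean URL
# instead of indexing each variant separately.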
# --- Primary search engines ---
User-agent: Googlebot
Allow: /

User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot-Image
Allow: /

User-agent: Googlebot-News
Allow: /

User-agent: Bingbot
Allow: /

User-agent: DuckDuckBot
Allow: /

User-agent: Slurp
Allow: /

User-agent: Yandex
Allow: /

User-agent: YandexMobileBot
Allow: /

User-agent: Baiduspider
Allow: /

# --- AI / LLM crawlers (whitelisted for GEO: generative engine optimization) ---

# OpenAI
User-agent: GPTBot
Allow: /

User-agent: ChatGPT-User
Allow: /

User-agent: OAI-SearchBot
Allow: /

# Anthropic
User-agent: ClaudeBot
Allow: /

User-agent: Claude-Web
Allow: /

User-agent: Anthropic-AI
Allow: /

User-agent: Claude-SearchBot
Allow: /

# Google AI (Gemini / AI Overviews)
User-agent: Google-Extended
Allow: /

User-agent: GoogleOther
Allow: /

User-agent: GoogleOther-Image
Allow: /

# xAI (Grok)
User-agent: xai-bot
Allow: /

User-agent: Grok
Allow: /

# Perplexity
User-agent: PerplexityBot
Allow: /

User-agent: Perplexity-User
Allow: /

# Meta AI
User-agent: FacebookBot
Allow: /

User-agent: Meta-ExternalAgent
Allow: /

User-agent: meta-externalagent
Allow: /

User-agent: Meta-ExternalFetcher
Allow: /

# Microsoft Copilot / Bing AI
User-agent: BingPreview
Allow: /

User-agent: CopilotBot
Allow: /

# Apple Intelligence / Siri
User-agent: Applebot
Allow: /

User-agent: Applebot-Extended
Allow: /

# Cohere
User-agent: cohere-ai
Allow: /

User-agent: cohere-training-data-crawler
Allow: /

# Amazon / Alexa
User-agent: Amazonbot
Allow: /

# ByteDance (TikTok AI / Doubao)
User-agent: Bytespider
Allow: /

# You.com
User-agent: YouBot
Allow: /

# Brave Search AI
User-agent: Brave
Allow: /

# Common Crawl (used by many AI models for training)
User-agent: CCBot
Allow: /

# Ai2 (Allen Institute for AI)
User-agent: AI2Bot
Allow: /

User-agent: AI2Bot-Dolma
Allow: /

# Mistral
User-agent: MistralAI-User
Allow: /

# DeepSeek
User-agent: DeepSeekBot
Allow: /

# Kagi Search
User-agent: Kagibot
Allow: /

# Neeva
User-agent: Neevabot
Allow: /

# --- Aggressive SEO scrapers (blocked to preserve crawl budget) ---
User-agent: AhrefsBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: DotBot
Disallow: /

User-agent: Rogerbot
Disallow: /

User-agent: Exabot
Disallow: /

User-agent: BLEXBot
Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: PetalBot
Disallow: /

User-agent: SeznamBot
Disallow: /

Sitemap: https://tapsay.me/sitemap-index.xml
Sitemap: https://tapsay.me/sitemap.xml
Sitemap: https://tapsay.me/sitemap-images.xml

# AI/LLM context files — llmstxt.org specification
# https://tapsay.me/llms.txt — concise index
# https://tapsay.me/llms-ctx.txt — token-efficient context bundle
# https://tapsay.me/llms-full.txt — extended full context
# https://tapsay.me/ai.txt — AI training & inference permissions (citation-friendly)
#
# Subscription: https://tapsay.me/feed.xml — RSS 2.0 feed of blog posts
# Trust files:
# https://tapsay.me/humans.txt
# https://tapsay.me/.well-known/security.txt
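# Quick sanity check (illustrative): confirm a given bot token resolves to
# the intended group, e.g.
#   curl -s https://tapsay.me/robots.txt | grep -A 1 "User-agent: GPTBot"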