# NiaMeowDB robots.txt
#
# Strategy: block AI *training* crawlers, allow AI *search/citation* crawlers.
# We want ChatGPT, Perplexity, Gemini, etc. to link users to NiaMeowDB when they
# ask about MapleStory Classic — we do NOT want our content used
# for model training or bulk-harvested by competitors.
#
# References:
#  - https://platform.openai.com/docs/bots (OpenAI: GPTBot = training, OAI-SearchBot/ChatGPT-User = search)
#  - https://darkvisitors.com (crawler directory)
#  - https://support.google.com/webmasters/answer/80553 (Google-Extended = Gemini training)

# ===== AI training crawlers (and other bulk harvesters) — BLOCKED =====
# One group per user-agent token, each with a site-wide "Disallow: /".
# A crawler that matches one of these groups follows only that group; it does
# not additionally apply the "User-agent: *" rules further down.

# OpenAI's training crawler (see the OpenAI bots reference in the file header).
User-agent: GPTBot
Disallow: /

# NOTE(review): ClaudeBot, anthropic-ai and Claude-Web are presumably all
# Anthropic tokens — confirm which of them are still in active use.
User-agent: ClaudeBot
Disallow: /

User-agent: anthropic-ai
Disallow: /

User-agent: Claude-Web
Disallow: /

# Gemini training opt-out token (see the Google reference in the file header).
User-agent: Google-Extended
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: Amazonbot
Disallow: /

User-agent: Meta-ExternalAgent
Disallow: /

User-agent: FacebookBot
Disallow: /

User-agent: Diffbot
Disallow: /

User-agent: Omgilibot
Disallow: /

User-agent: cohere-ai
Disallow: /

User-agent: PanguBot
Disallow: /

# NOTE(review): the remaining entries in this section (DataForSeoBot through
# PetalBot) look like SEO/backlink-index and general-purpose crawlers rather
# than AI-training bots — presumably blocked under the "bulk-harvested by
# competitors" goal stated in the file header; confirm.
User-agent: DataForSeoBot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: AhrefsBot
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: PetalBot
Disallow: /

# Chinese search crawlers (Sogou family plus YisouSpider). YisouSpider in
# particular was hammering the site across multiple CN ASNs in May 2026 —
# crawling every guide / class / item / monster including the ?_rsc=...
# prefetch variants. We get no meaningful search traffic from these crawlers
# in our target markets, so they're pure billed-request cost. Also blocked at
# the WAF (custom rule 3) since rude crawlers routinely ignore robots.txt.
#
# NOTE(review): YisouSpider is usually attributed to Yisou/Shenma rather than
# Sogou — confirm before relying on the "Sogou family" label elsewhere.
# NOTE(review): the Sogou tokens below contain spaces. RFC 9309 product tokens
# allow only letters, "-" and "_", so strict parsers may read all three as
# just "Sogou". The outcome is the same either way because every group here is
# "Disallow: /".
User-agent: YisouSpider
Disallow: /

User-agent: Sogou web spider
Disallow: /

User-agent: Sogou inst spider
Disallow: /

User-agent: Sogou Pic Spider
Disallow: /

# ===== AI search / citation crawlers — ALLOWED =====
# These fetch pages in real-time when a user asks a question, so NiaMeowDB can
# appear as a cited source in AI answers. Do NOT block these.
#
#   OAI-SearchBot     — SearchGPT / ChatGPT search indexing
#   ChatGPT-User      — ChatGPT browse-on-demand (user-initiated)
#   PerplexityBot     — Perplexity indexing + citations
#   Perplexity-User   — Perplexity user-initiated fetches
#   Google-InspectionTool — Search Console previews
#
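# (No dedicated group is needed: a crawler with no matching User-agent group
# falls through to the "User-agent: *" group below, so these bots are allowed
# everywhere except the paths that group disallows.)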

# ===== Everyone else (search engines, normal bots) — ALLOWED =====

User-agent: *
# Default group: applies to every crawler that has no dedicated group above.
#
# IMPORTANT: keep this group free of blank lines, from "User-agent: *" down to
# the last honeypot rule. RFC 9309 parsers ignore blank lines, but parsers
# that follow the original 1994 convention (e.g. Python's urllib.robotparser)
# end a record at the first blank line. Rules after it would be silently
# dropped, and honest crawlers using such parsers would walk into the honeypot
# paths and be logged as hostile.
Disallow: /msclassic/admin/
# Image-only routes that have no SEO value and only ever exist as <img> /
# og:image targets. These were previously kept crawlable + noindex'd, which
# correctly suppresses indexing but inflates GSC's "Crawled - currently not
# indexed" report (~644 of 1,692 URLs as of 2026-05-06). Disallow moves them
# to the small "Blocked by robots.txt" bucket instead. Existing X-Robots-Tag:
# noindex headers in next.config.mjs stay as defense-in-depth.
Disallow: /msclassic/api/assets/
# opengraph-image routes at any depth. Both patterns start with "/" as the
# RFC 9309 path-pattern grammar requires. The first covers a root-level
# /opengraph-image, the second covers nested ones such as
# /msclassic/.../opengraph-image. Rules are prefix matches, so no trailing
# "*" is needed to cover suffixed variants.
Disallow: /opengraph-image
Disallow: /*/opengraph-image
# /msclassic/api/, /msclassic/login, /msclassic/reset-password are
# intentionally NOT Disallow'd here. They're heavily internally linked, so
# blocking crawl only produces "Indexed, though blocked by robots.txt" warnings
# in GSC. Instead they're kept out of the index via noindex:
#   - /msclassic/api/*             → X-Robots-Tag: noindex (next.config.mjs)
#   - /msclassic/login             → robots: noindex (page metadata)
#   - /msclassic/reset-password    → robots: noindex (page metadata)
#
# ===== Honeypot trap paths (still part of the "User-agent: *" group) =====
# These do NOT exist as real content. They are Disallow-baited URLs that
# every honest crawler (Googlebot, Bingbot, OAI-SearchBot, PerplexityBot,
# ClaudeBot, etc.) will respect and skip. Dishonest scrapers either
# ignore robots.txt outright OR specifically target Disallow entries
# because they assume those paths are hidden-value content. Every GET
# against these routes lands in the honeypot_hits D1 table; review at
# /msclassic/admin/honeypot to identify IPs/ASNs to promote into the WAF
# custom block rule. See src/app/trap/[source]/[slug]/route.ts.
Disallow: /trap/r/premium-builds-leaked
Disallow: /trap/r/cash-shop-discount-codes
Disallow: /trap/h/

# Sitemap records are standalone: they are not part of any User-agent group
# and are advertised to every crawler that reads this file.
# NOTE(review): both sitemap-index.xml and sitemap.xml are listed at the site
# root — confirm both exist and that any overlap between them is intentional.
Sitemap: https://meowdb.com/sitemap-index.xml
Sitemap: https://meowdb.com/sitemap.xml
Sitemap: https://meowdb.com/msclassic/sitemap.xml