# Here's a "robots.txt" that can be used with Confluence

# Note: this file uses parameters specific to Google that are not part of the robots.txt standard
# http://www.google.com/support/webmasters/, http://www.robotstxt.org/wc/faq.html and http://en.wikipedia.org/wiki/Robots_Exclusion_Standard were used to research said parameters
# some links shouldn't be visible to an anonymous browser such as the GSA, but are included for completeness

User-agent: * # match all bots. GSA is our primary crawler but logs indicate there may be others on our Intranet
Crawl-delay: 5 # per http://en.wikipedia.org/wiki/Robots.txt#Nonstandard_extensions, sets number of seconds to wait between requests to 5 seconds. may not work

Disallow: /pages/ # temporary blanket block (duplicates the /pages/ rule in the section below) to purge GSA of all old page entries; will be removed in the next iteration so that the specific /pages/ lines take effect
Disallow: /admin/ # administrator links
Disallow: /administrators.action? # remove any administrator links (path spelling corrected from "adminstrators")
Disallow: /createrssfeed.action? # remove internal RSS links
Disallow: /dashboard.action? # remove the dashboard, heavy resource hit
Allow: /display # ensure primary display pages are allowed
Disallow: /display/*&tasklist.complete= # remove tasklist links
Disallow: /display/*?decorator=printable # remove printable version links
Disallow: /display/*?focusedCommentId= # remove page comment focus links
Disallow: /display/*?refresh= # prevent crawler from clicking refresh button
Disallow: /display/*?replyToComment= # remove reply to comment links
Disallow: /display/*?rootCommentId= # remove news comment focus links
Disallow: /display/*?showComments=true&showCommentArea=true # remove add comment links; the "#addcomment" fragment was dropped because "#" starts a comment in robots.txt and fragments are never sent to crawlers anyway
Disallow: /doexportpage.action? # remove pdf export links
Disallow: /dopeopledirectorysearch.action # people search
Disallow: /dosearchsite.action? # remove specific site searches
Disallow: /exportword? # remove word export links
Disallow: /login.action? # remove the login page

# Allow: /pages/viewpage.action?* # allows indexing of pages with invalid titles for html (such as ?'s). Unfortunately currently allows page history to sneak in
Disallow: /pages/ # this line to purge GSA of all old page entries, will be removed in next iteration so that specific /pages/ lines below take effect
Disallow: /pages/copypage.action? # remove copy page links
Disallow: /pages/createblogpost.action? # remove add news links
Disallow: /pages/createpage.action? # remove add page links
Disallow: /pages/diffpages.action? # remove page comparison pages
Disallow: /pages/diffpagesbyversion.action? # remove page comparison links
Disallow: /pages/editblogpost.action? # remove edit news links
Disallow: /pages/editpage.action? # remove edit page links
Disallow: /pages/removepage.action? # remove the remove page links
Disallow: /pages/revertpagebacktoversion.action? # remove reversion links
Disallow: /pages/templates # remove template pages
Disallow: /pages/templates/ # block template indexes
Disallow: /pages/viewchangessincelastlogin.action? # remove page comparison pages
Disallow: /pages/viewpagesrc.action? # remove view page source links
Disallow: /pages/viewpreviouspageversions.action? # remove the link to previous versions
Disallow: /plugins/ # blocks plug-in calls
Disallow: /rpc/ # remove any RPC links
Disallow: /searchsite.action? # remove the wiki search engine pages
Disallow: /spaces/ # remove space action pages
Disallow: /themes/ # theme links
Disallow: /users/ # remove user action pages
Disallow: /x/ # remove tiny link urls

# End file