From Just another day in the life of a linux sysadmin
Revision as of 08:55, 29 July 2017 by Joelparks (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search


 Creating the robots.txt file is relatively easy. The location where it's placed controls the bot traffic for that location, so placement is important. You will likely need these in the domain's public_html folder, as placing them within subfolders will not limit bots for folders higher in the hierarchy. The file MUST be called robots.txt, and the format is pretty simple. It should contain which bot you want to control as well as what rules you want it to follow:

The format you would need for your domains is as follows:

 User-agent:  *  
 Crawl-delay:  50 

This is what the above is basically doing:

User-agent:  *     <---- The star here says all bots must follow this rule; you can adjust this for specific bots
Crawl-delay:  50   <---- This rule limits each bot to a delay of 50 between requests

There are other rules that can be used, but for the most part crawl delay is the most important one for system admins.

EA3 Bot search

# Report today's top bot/crawler traffic across all Apache domlogs (EA3 layout).
# Output columns: hit count, source IP, bot user-agent token, domain log name.
#
# Fix vs. original: the perl program is single-quoted, so its `/$date/` was an
# undefined *Perl* variable — an empty pattern that matched every line, meaning
# the "today only" filter never applied. We export the shell variable and read
# it inside perl via %ENV (\Q...\E quotes the slashes in e.g. 29/Jul/2017).
date=$(date +%d/%b/%Y)
export date
echo -e "\n\e[1;31m=== Bots (robots or crawlers) ===\e[0m\n"
find /usr/local/apache/domlogs/*/ -type f \
  | grep -v -E $'(_|-)log|\.gz' \
  | xargs grep -H "" \
  | perl -ne 'if (m{\Q$ENV{date}\E} && /\/usr\/local\/apache\/domlogs\/.*\/(.*):(\d{1,3}(?:\.\d{1,3}){3}).*\((?:.*?;)*([^;]*(?:b(?:ot|ing)|crawl|yahoo|google|spider)[^;]*);/i) { print("$1\t$2\t$3\n") }' \
  | sort | uniq -c | sort -rn \
  | awk '{print $1" "$3" "$4" "$2}' \
  | column -t | head

Scrape Report

# Per-domain scrape report for cPanel servers.
# For every non-parked domain in /etc/userdatadomains: count bot/crawler/spider
# hits in the domain's Apache log, then check whether each observed bot is
# mentioned in the docroot's robots.txt.
#
# Fixes vs. original: every `if` was missing its `[ ... ]` test brackets
# (e.g. `if -d "/usr/local/cpanel/"` — bash would try to execute `-d`),
# the HTTPD_ROOT probe had a stray `)` and no input command, and
# `if $(grep ...)` executed grep's output instead of testing its exit status.
blue='\e[1;96m'; red='\e[1;31m'; green='\e[1;32m'; yellow='\e[1;33m'; purple='\e[1;35m'; reset='\e[0m'
if [ -d "/usr/local/cpanel/" ]; then
  # Pick the domlog path by Apache's compiled-in root.
  # NOTE(review): the original was garbled here ("if egrep ... )" with no
  # input); this presumably probed `httpd -V` for HTTPD_ROOT — confirm on a
  # live cPanel box.
  if httpd -V 2>/dev/null | egrep -q "HTTPD_ROOT[^/etc]+\/etc"; then
    logPath="/home/domlogs/"
  else
    logPath="/var/log/apache2/domlogs/"
  fi
  while read domain user doc; do
    echo -e "${blue}${domain}${reset}"
    file="${doc}/robots.txt"
    log="${logPath}${domain}"
    if [ -f "$log" ]; then
      echo -e "${purple}domlog - $log${reset}"
      # Bot tokens look like "Googlebot/2.1" — keep the name before the slash.
      bots=$(egrep -o "[a-zA-Z0-9]*(spider|crawler|bot)\/[0-9]\.[0-9]" "$log" | cut -d"/" -f1 | sort | uniq -c | sort)
      if [ -z "$bots" ]; then
        echo -e "${yellow}Bots aren't crawling${reset}"
      else
        echo -e "${blue}Bots are crawling"'!'"${reset}"
        # $bots is "count name count name ..."; re-split into one pair per line.
        while read count bot; do
          echo -e "${red}   $count hits by $bot${reset}"
        done < <(echo ${bots} | sed -r "s/([a-zA-Z]+)\ /\1\n/g")
        if [ -f "$file" ]; then
          echo -e "${green}Checking for the following bots in ${file}${reset}"
          while read bot; do
            # Test grep's exit status, not its output.
            if grep -q "$bot" "$file"; then
              echo -e "${green}   $bot exists${reset}"
            else
              echo -e "${red}   $bot doesn't exist${reset}"
              listToAdd="$listToAdd $bot"
            fi
          done < <(echo ${bots} | sed -r "s/([a-zA-Z]+)\ /\1\n/g" | awk '{print$2}')
        else
          echo -e "${red}robots.txt - not found${reset}"
        fi
      fi
    else
      echo -e "${red}domlog - not found${reset}"
    fi
    echo
  done < <(grep -v '=parked=' /etc/userdatadomains | sed -r "s/^(.*+)\: ([^=]+)=+[^=]+=+([^=]+)=+[^=]+=+([^=]+)=+.*$/\1 \2 \4/g")
else
  echo "This is meant to be run on a cPanel server"
fi