# AWSTATS ROBOTS DATABASE #------------------------------------------------------- # If you want to add robots to extend AWStats database detection capabilities, # you must add an entry in RobotsSearchIDOrder_listx and RobotsHashIDLib. # The entry in RobotsSearchIDOrder_listx is a Perl regular expression # (see http://perldoc.perl.org/perlreref.html). AWStats applies these # expressions to the user agent string in the order given by the lists. The # first match specifies the robot. # # The corresponding entry in RobotsHashIDLib contains the regular expression # as key, followed by a string containing HTML-text. AWStats inserts this # text into reports to describe the bot. If possible the text should contain # a link to the bot home page. This makes it easier for sysadmins to find # the information necessary e.g. to adapt the robots.txt file. # # An entry in the RobotsAffiliateLib is not necessary. An entry in this list # contains as first part the regular expression specifying the bot. The # second part is a string that gives the Company or product managing the bot. # This information is not used yet. #------------------------------------------------------- # 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html # added dipsie (not tested with real data). # added DomainsDB.net http://domainsdb.net/ # added ia_archiver-web.archive.org (was inadvertently grouped with Alexa traffic) # added Nutch (used by looksmart (furl?)) # added rssImagesBot # added Sqworm # added t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e # added w3c css-validator # added documentation link to bot home pages for above and selected major bots. # In the case of international bots, choose .com page. # Included tool tip (html "title"). # To do: parameterize to match both AWStats language and tooltips settings. # To do: add html links for all bots based on current documentation in source # files referenced below. 
# changed '\wbot[\/\-]', to '\wbot[\/\-]' (removed comma) # made minor grammar corrections to notes below # 2005-08-24 added YahooSeeker-Testing # added w3c-checklink # updated url for ask.com # 2005-08-24 added Girafabot http://www.girafa.com/ # 2005-08-30 added PluckFeedCrawler http://www.pluck.com/ # added Gaisbot/3.0 (robot05@gais.cs.ccu.edu.tw; ) # added geniebot (wgao@genieknows.com) # added BecomeBot link http://www.become.com/site_owners.html # added topicblogs http://www.topicblogs.com/ # added Powermarks; seen used by referrer spam # added YahooSeeker # added NG/2. http://www.exabot.com/ # 2005-09-15 added link for Walhello appie # added bender focused_crawler # updated YahooSeeker description (blog crawler) # 2005-09-16 added link for http://linkchecker.sourceforge.net # added ConveraCrawler/0.9d ( http://www.authoritativeweb.com/crawl) # added Blogslive info@blogslive.com intelliseek.com # added BlogPulse (ISSpider-3.0) intelliseek.com # 2005-09-26 added Feedfetcher-Google (http://www.google.com/feedfetcher.html) # added EverbeeCrawler # added Yahoo-Blogs http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html # added link for Bloglines http://www.bloglines.com # 2005-10-19 fixed Feedfetcher-Google (http://www.google.com/feedfetcher.html) # added Blogshares Spiders (Synchronized V1.5.1) # added yacy # 2005-11-21 added Argus www.simpy.com # added BlogsSay :: RSS Search Crawler (http://www.blogssay.com/) # added MJ12bot http://majestic12.co.uk/bot.php # added OpenTaggerBot (http://www.opentagger.com/opentaggerbot.htm) # added OutfoxBot/0.3 (For internet experiments; outfox.agent@gmail.com) # added RufusBot Rufus Web Miner http://64.124.122.252.webaroo.com/feedback.html # added Seekbot (http://www.seekbot.net/bot.html) # added Yahoo-MMCrawler/3.x (mms-mmcrawler-support@yahoo-inc.com) # added link for BaiDuSpider # added link for Blogshares Spider # added link for StackRambler http://www.rambler.ru/doc/faq.shtml # added link for WISENutbot # added 
link for ZyBorg/1.0 (wn-14.zyborg@looksmart.net; http://www.WISEnutbot.com. Moved location to above wisenut to avoid classification as wisenut # 2005-12-15 # added FAST Enteprise Crawler/6 (www dot fastsearch dot com). Note spelling Enteprise not Enterprise. # added findlinks http://wortschatz.uni-leipzig.de/findlinks/ # added IBM Almaden Research Center WebFountain(TM) http://www.almaden.ibm.com/cs/crawler [hc3] # added INFOMINE/8.0 VLCrawler (http://infomine.ucr.edu/useragents) # added lmspider (lmspider@scansoft.com) http://www.nuance.com/ # added noxtrumbot http://www.noxtrum.com/ # added SandCrawler (Microsoft) # added SBIder http://www.sitesell.com/sbider.html # added SeznamBot http://fulltext.seznam.cz/ # added sohu-search http://corp.sohu.com/ (looked for //robots.txt not /robots.txt) # added the ruffle SemanticWeb crawler v0.5 - http://www.unreach.net # added WebVulnCrawl/1.0 libwww-perl/5.803 (looked for //robots.txt not /robots.txt) # added Yahoo! Japan keyoshid http://www.yahoo.co.jp/ # added Y!J http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html # added link for GigaBot # added link for MagpieRSS # added link for MSIECrawler # 2005-12-21 # added aipbot http://www.aipbot.com aipbot@aipbot.com [matthys70 users.sourceforge.net] # added Everest-Vulcan Inc./0.1 (R&D project; http://everest.vulcan.com/crawlerhelp) # added Fast-Search-Engine http://www.fast-search-engine.com/ [matthys70 users.sourceforge.net] # added g2Crawler (nobody@airmail.net) http://crawler.instantnetworks.net/ # added Jakarta commons-httpclient http://jakarta.apache.org/commons/httpclient/ (hit robots.txt). May be used as robot or browser - a site may want to remove this entry. # added OmniExplorer_Bot http://www.omni-explorer.com/ [matthys70 users.sourceforge.net] # added USTC-Semantic-Group ai.ustc.edu.cn/mas/en/research/index.php ? 
# 2005-12-22 # added EARTHCOM.info www.earthcom.info # added HTTrack off-line browser 'httrack','HTTrack', http://www.httrack.com/ [Moizes Gabor] # added KummHttp http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b [Moizes Gabor] # 2006-01-01 # added Dulance http://www.dulance.com/bot.jsp # added MojeekBot http://www.mojeek.com/bot.html # added nicebot http://www.egghelp.org/setup.htm ? # added Snappy http://www.urltrends.com/faq.php # added sohu agent # added VORTEX http://marty.anstey.ca/robots/vortex/ [matthys70 users.sourceforge.net] # added zspider http://feedback.redkolibri.com/ # 2006-01-13 # added boitho.com-dc http://www.boitho.com/dcbot.html # added IRLbot http://irl.cs.tamu.edu/crawler # added virus_detector virus_harvester@securecomputing.com # added Wavefire http://www.wavefire.com; info@wavefire.com # added WebFilter Robot # 2006-01-24 # added Shim-Crawler http://www.logos.ic.i.u-tokyo.ac.jp/crawler/; crawl@logos.ic.i.u-tokyo.ac.jp # added Exabot exabot.com # added LetsCrawl.com http://letscrawl.com # added ichiro http://help.goo.ne.jp/door/crawlerE.html # 2006-01-27 additional 22 robots from a list provided by Moizes Gabor # added ALeadSoftbot http://www.aleadsoft.com/bot.htm # added CipinetBot http://www.cipinet.com/bot.html # added Cuasarbot http://www.cuasar.com/ # added Dumbot http://www.dumbfind.com/ # added Extreme_Picture_Finder http://www.exisoftware.com/ # added Fooky.com/ScorpionBot/ScoutOut http://www.fooky.com/scorpionbots # added IlTrovatore-Setaccio http://www.iltrovatore.it/aiuto/motore_di_ricerca.html bot@iltrovatore.it # added InsurancoBot http://www.fastspywareremoval.com/ # added InternetArchive http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org # added KazoomBot http://www.kazoom.ca/bot.html kazoombot@kazoom.ca # added Kurzor http://www.easymail.hu/ cursor@easymail.hu # added NutchCVS http://lucene.apache.org/nutch/bot.html 
nutch-agent@lucene.apache.org # added NutchOSU-VLIB http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org # added Orbiter http://www.dailyorbit.com/bot.htm # added PHP_version_tracker http://www.nexen.net/phpversion/bot.php # added SuperBot http://www.sparkleware.com/superbot/ # added SynooBot http://www.synoo.de/bot.html webmaster@synoo.com # added TestBot http://www.agbrain.com/ # added TutorGigBot http://www.tutorgig.info/ # added WebIndexer mailto://webindexerv1@yahoo.com # added WebMiner http://64.124.122.252/feedback.html # 2006-02-01 # added heritrix https://sourceforge.net/forum/message.php?msg_id=3550202 # added Zeus Webster Pro https://sourceforge.net/forum/message.php?msg_id=3141164 # additional robots from a list provided by Moizes Gabor [ mojzi -a-t- free mail hu ] # added Candlelight_Favorites_Inspector # added DomainChecker # added EasyDL # added FavOrg # added Favorites_Sweeper # added Html_Link_Validator # added Internet_Ninja # added JRTwine_Software_Check_Favorites_Utility # fixed Microsoft_URL_Control # added miniRank # added Missigua_Locator # added NPBot # added Ocelli # added Onet.pl_SA # added proodleBot # added SearchGuild_DMOZ_Experiment # added Susie # added Website_Monitoring_Bot # added Xenu_Link_Sleuth # 2006-05-15 # added ASPseek http://www.aspseek.org/ # added AdamM Bot http://home.blic.net/adamm/ # added archive.org_bot http://crawls.archive.org/collections/bncf/crawl.html # added arianna.libero.it (Italian Portal/search engine) # added Biz360 spider http://www.biz360.com # added BlogBridge Service http://www.blogbridge.com/ # added BlogSearch http://www.icerocket.com/ # added libcrawl # added edgeio-relanshanbottriever http://www.edgeio.com # added FeedFlow http://feedflow.com/about # added Biblioteca Nazionale Centrale di Firenze (Italian National Archive) http://www.bncf.firenze.sbn.it/raccolta.txt # added Java catchall - used by many spam bots # added lanshanbot 
http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb # added msnbot-media http://search.msn.com/msnbot.htm # added MT::Telegraph::Agent # added Netluchs http://www.netluchs.de/ (German SE bot) # added oBot http://www.webmasterworld.com/forum11/1616.htm # added Onfolio http://www.onfolio.com/ (IE Toolbar plugin) - hit rss feeds. # added ping.blo.gs http://blo.gs/ping.php blog bot # added Sphere Scout http://www.sphere.com/ # added sproose crawler http://www.sproose.com/bot.html # added SyndicAPI http://syndicapi.com/bot.html # added Yahoo! Mindset http://mindset.research.yahoo.com/ # added msrabot # added Vagabondo & Vagabondo-WAP http://www.wise-guys.nl/Contact/index.php?botselected=webagents&lang=uk # fixed Missigua Locator detection (Missigua_Locator -> Missigua Locator) # changed echo to echo! to avoid conflict with the bonecho (Firefox 2.0) browser. # This requires you to reprocess historic logs if you want EchO! to be recognized for older reports. # 2006-05-17 # added Alpha Search Agent # 62.152.125.60 Eurologon Srl # added Krugle http://www.krugle.com/crawler/info.html the search engine for developers # added Octora Beta Bot http://www.octora.com/ # Blog and Rss Search Engine # added UbiCrawler http://law.dsi.unimi.it/ubicrawler/ # added Yahoo! Slurp China http://misc.yahoo.com.cn/help.html # You must reprocess old logs for the Yahoo! 
Slurp China bot to be detected in old reports # 2006-05-20 # added 1-More Scanner http://www.myzips.com/software/1-More-Scanner.phtml # added Accoona-AI-Agent http://www.accoona.com/ # added ActiveBookmark http://www.libmaster.com/active_bookmark.php # added BIGLOTRON http://www.biglotron.com/robot.html # added Bookmark-Manager http://bkm.sourceforge.net/ # added cbn00glebot # added Cerberian Drtrs http://www.pgts.com.au/cgi-bin/psql?robot_info=25240 # added CFNetwork http://www.cocoadev.com/index.pl?CFNetwork # added CheckWeb link validator http://p.duby.free.fr/chkweb.htm # added Computer and Automation Research Institute Crawler http://www.ilab.sztaki.hu/~stamas/publications/p184-benczur.html # added ConveraCrawler http://www.authoritativeweb.com/crawl/ # added ConveraMultiMediaCrawler http://www.authoritativeweb.com/crawl/ # added CSE HTML Validator Lite Online http://online.htmlvalidator.com/php/onlinevallite.php # added Cursor http://adcenter.hu/docs/en/bot.html # added Custo http://www.netwu.com/custo/ # added DataFountains/DMOZ Downloader http://infomine.ucr.edu/ # added Deepindex http://www.deepindex.net/faq.php # added DNSGroup http://www.dnsgroup.com/ # added DoCoMo http://www.nttdocomo.co.jp/ # added dumm.de-Bot http://www.dumm.de/ # added ETS v http://www.freetranslation.com/help/ # added eventax http://www.eventax.de/ # added FAST Enterprise Crawler * crawleradmin.t-info@telekom.de http://www.telekom.de/ # added FAST Enterprise Crawler http://www.fast.no/ # added FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de http://www.telekom.de/ # added FeedValidator http://feedvalidator.org/ # added FilmkameraBot http://www.filmkamera.at/bot.html # added Findexa Crawler http://www.findexa.no/gulesider/article26548.ece # added Global Fetch http://www.wesonet.com/ # added GOFORITBOT http://www.goforit.com/about/ # added GoForIt.com http://www.goforit.com/about/ # added GPU p2p crawler http://gpu.sourceforge.net/search_engine.php # added 
HooWWWer http://cosco.hiit.fi/search/hoowwwer/ # added HPPrint # added HTMLParser http://htmlparser.sourceforge.net/ # added Hundesuche.com-Bot http://www.hundesuche.com/ # added InfoBot http://www.infobot.org/ # added InfociousBot http://corp.infocious.com/tech_crawler.php # added InternetSupervision http://internetsupervision.com/ # added isearch2006 http://www.yahoo.com.cn/ # added IUPUI_Research_Bot http://spamhuntress.com/2005/04/25/a-mail-harvester-visits/ # added KalamBot http://64.124.122.251/feedback.html # added kamano.de NewsFeedVerzeichnis http://www.kamano.de/ # added Kevin http://dznet.com/kevin/ # added KnowItAll http://www.cs.washington.edu/research/knowitall/ # added Knowledge.com http://www.knowledge.com/ # added Kouaa Krawler http://www.kouaa.com/ # added ksibot http://ego.ms.mff.cuni.cz/ # added Link Valet Online http://www.htmlhelp.com/tools/valet/ # added lwp-request http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request # added lwp-trivial http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm # added MapoftheInternet.com http://MapoftheInternet.com/ # added Matrix S.p.A. 
- FAST Enterprise Crawler http://tin.virgilio.it/ # added Megite http://www.megite.com/ # added Metaspinner http://index.meta-spinner.de/ # added Mini-reptile # added Misterbot http://www.misterbot.fr/ # added Miva http://www.miva.com/ # added Mizzu Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b # added MSRBOT http://research.microsoft.com/research/sv/msrbot/ # added MS SharePoint Portal Server - MS Search 4.0 Robot http://support.microsoft.com/default.aspx?scid=kb;en-us;284022 # added Mydoyouhike http://www.doyouhike.net/my # added NASA Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b # added NetSprint http://www.netsprint.pl/serwis/ # added NimbleCrawler http://www.healthline.com/ # added OpenWebSpider http://www.openwebspider.org/ # added Oracle Ultra Search http://www.oracle.com/technology/products/ultrasearch/index.html # added OSSProxy http://www.marketscore.com/FAQ.Aspx # added passwordmaker.org http://passwordmaker.org/ # added PEAR HTTP Request class http://pear.php.net/ # added PEERbot http://www.peerbot.com/ # added PHP version tracker http://www.nexen.net/phpversion/bot.php # added PictureOfInternet http://malfunction.org/poi/ # added plinki http://www.plinki.com/ # added Port Huron Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b # added PostFavorites http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b # added ProjectWF-java-test-crawler # added PyQuery http://sourceforge.net/projects/pyquery/ # added Schizozilla http://spamhuntress.com/2005/03/18/gizmo/ # added Scumbot # added Sensis Web Crawler http://www.sensis.com.au/ # added snap.com beta crawler http://www.snap.com/ # added Steeler http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ # added STEROID Download 
http://faqs.org.ru/progr/pascal/delphi_internet2.htm # added Suchfin-Bot http://www.suchfin.de/ # added Sunrise http://www.sunrisexp.com/ # added Tagyu Agent http://www.tagyu.com/ # added Tcl http client package http://www.tcl.tk/man/tcl8.4/TclCmd/http.htm # added TeragramCrawlerSURF http://www.teragram.com/ # added Test Crawler http://netp.ath.cx/ # added UnChaos Bot Hybrid Web Search Engine http://www.unchaos.com/ # added unido-bot http://www.unchina.org/unido/unido/our_projects/3_3.html # added UniversalFeedParser http://feedparser.org/ (seen from md301000.inktomisearch.com) # added updated http://www.updated.com/ # added Vermut http://vermut.aol.com # added versus crawler from eda.baykan@epfl.ch http://www.epfl.ch/Eindex.html # added Vespa Crawler (Yahoo Norway?) http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb # added VSE http://www.vivisimo.com/ # added webcrawl.net http://www.webcrawl.net/ # added Web Downloader http://www.krasu.ru/soft/chuchelo/ # added Webdup http://www.webdup.com/en/index.html # added Wells Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b # added WordPress http://wordpress.org/ # added wume crawler http://wume.cse.lehigh.edu/~xiq204/crawler/ # added Xenu's Link Sleuth (with ') # added xirq http://www.xirq.com/ # added yoogliFetchAgent http://www.yoogli.com/ # added Z-Add Link Checker http://w3.z-add.co.uk/linkcheck/ # -- fix - some robots were reported with _ where _ should have been a space. # changed Xenu Link Sleuth # changed microsoft[_+ ]url[_+ ]control -> microsoft_url_control # changed favorites_sweeper -> favorites_sweeper # -- updates # updated AskJeeves to Ask # 2012-06-05 Albrecht Mueller # added Grabber from SDSC (San Diego Supercomputer Center). 
# 2013-09-30 Albrecht Mueller # AWStats probably cannot detect this bot as it identifies itself in # the referrer field and not in the user agent string. #92.113.100.35 - - [29/Sep/2013:17:22:46 +0200] "GET /robots.txt HTTP/1.1" 200 516 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" #92.113.100.35 - - [29/Sep/2013:17:22:49 +0200] "GET /tghome.htm HTTP/1.1" 200 4445 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" #92.113.100.35 - - [29/Sep/2013:17:22:51 +0200] "GET / HTTP/1.1" 200 5467 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" # to do MS Search 4.0 Robot #package AWSROB; # Robots list was found at http://www.robotstxt.org/wc/active/all.txt # Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html # Rem: To avoid bad detection, some robot's ids were removed from this list: # - Robots with ID of 3 letters only # - Robots called 'webs' and 'tcl' # Rem: directhit changed into direct_hit (its real id) # Rem: calif changed into calif[^r] to avoid confusion between Tiscalifreenet browser # Rem: fish changed into [^a]fish to avoid confusion between Madsafish browser # Rem: roadrunner changed into road_runner # Rem: lycos changed to lycos_ to avoid confusion with lycos-online browser # Rem: voyager changed into ^voyager\/ to avoid to exclude voyager and amigavoyager browser # RobotsSearchIDOrder # It contains all matching criteria to search for in log fields. This list is # used to know in which order to search Robot IDs. # Most frequent ones are in list1, used when LevelForRobotsDetection is 1 or more # Minor robots are in list2, used when LevelForRobotsDetection is 2 or more # Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+ ]' and are quoted. 
#-------------------------------------------------------
# Each entry below is a Perl regular expression (written as a single-quoted
# string). The lists are searched in order and the FIRST matching entry
# identifies the robot, so a more specific ID must always be listed before
# any shorter ID whose text it contains (e.g. 'muscatferret' before 'ferret').
# Every ID string is also used verbatim as a key of %RobotsHashIDLib, so an
# ID must never be respelled here without updating the hash as well.

# list1: most frequent robots, used when LevelForRobotsDetection is 1 or more.
@RobotsSearchIDOrder_list1 = (
    # Common robots (In robot file)
    'appie', 'architext', 'bingpreview', 'bjaaland', 'contentmatch',
    'muscatferret', # Must be before ferret
    'ferret', 'googlebot\-image', 'googlebot', 'google\-sitemaps',
    'google[_+ ]web[_+ ]preview', 'grabber', 'gulliver',
    'virus[_+ ]detector', # Must be before harvest
    'harvest', 'htdig', 'jeeves', 'linkwalker', 'lilina', 'lycos[_+ ]', 'moget',
    'myweb', 'nomad', 'scooter', 'slurp', '^voyager\/', 'weblayers',
    # Common robots (Not in robot file)
    'antibot', 'bruinbot', 'digout4u', 'echo!', 'fast\-webcrawler',
    'ia_archiver\-web\.archive\.org', # Must be before ia_archiver to avoid confusion with alexa
    'ia_archiver', 'jennybot', 'mercator', 'netcraft', 'msnbot\-media', 'msnbot',
    'petersnews', 'relevantnoise\.com', 'unlost_web_crawler', 'voila', 'webbase',
    'webcollage', 'cfetch',
    'zyborg', # Must be before wisenut
    'wisenutbot'
);

# list2: minor robots, used when LevelForRobotsDetection is 2 or more.
@RobotsSearchIDOrder_list2 = (
    # Less common robots (In robot file)
    '[^a]fish', 'abcdatos', 'abonti\.com', 'acme\.spider', 'ahoythehomepagefinder',
    'ahrefsbot', 'alkaline', 'anthill', 'arachnophilia', 'arale', 'araneo',
    'aretha', 'ariadne', 'powermarks', 'arks', 'aspider', 'atn\.txt', 'atomz',
    'auresys', 'backrub', 'bbot', 'bigbrother', 'blackwidow', 'blindekuh',
    'bloodhound', 'borg\-bot', 'brightnet',
    'openwebspider', # Must be before webspider and bspider
    'webspider', # Must be before bspider
    'bspider', 'cactvschemistryspider', 'calif[^r]', 'cassandra', 'cgireader',
    'checkbot', 'christcrawler', 'churl', 'cienciaficcion', 'collective',
    'combine', 'conceptbot', 'coolbot', 'core', 'cosmos', 'cruiser', 'cusco',
    'cyberspyder', 'desertrealm', 'deweb', 'dienstspider', 'digger', 'diibot',
    'direct_hit', 'dnabot', 'download_express', 'dragonbot', 'dwcp',
    'e\-collector', 'ebiness', 'elfinbot', 'emacs', 'emcspider', 'esther',
    'evliyacelebi', 'fastcrawler',
    'pluckfeedcrawler', # Must be before feedcrawl
    'feedcrawl', 'fdse', 'felix', 'fetchrover', 'fido', 'finnish', 'fireball',
    'fouineur', 'francoroute', 'freecrawl', 'funnelweb', 'gama', 'gazz',
    'gcreep', 'getbot', 'geturl', 'golem', 'gougou', 'grapnel', 'griffon',
    'gromit', 'gulperbot', 'hambot', 'havindex', 'hometown', 'htmlgobble',
    'hyperdecontextualizer', 'iajabot', 'iaskspider', 'hl_ftien_spider', 'sogou',
    'icjobs\.de', 'iconoclast', 'ilse', 'imagelock', 'incywincy', 'informant',
    'infoseeksidewinder', # Must be before infoseek
    'infoseek', 'infospider', 'inspectorwww', 'intelliagent', 'irobot', 'iron33',
    'israelisearch', 'javabee', 'jbot', 'jcrawler',
    'jobot', # Must be before jobo
    'jobo', 'joebot', 'jubii', 'jumpstation', 'kapsi', 'katipo', 'kilroy',
    'ko[_+ ]yappo[_+ ]robot', 'kummhttp', 'labelgrabber\.txt', 'larbin', 'legs',
    'linkidator', 'linkscan', 'lockon', 'logo_gif', 'macworm',
    'magpierss', # Must be before magpie
    'magpie', 'marvin', 'mattie', 'mediafox', 'merzscope', 'meshexplorer',
    'mindcrawler', 'mnogosearch', 'momspider', 'monster',
    '^motorola$', # Must be before motor
    'motor', 'muncher', 'mwdsearch', 'ndspider', 'nederland\.zoek', 'netcarta',
    'netmechanic', 'netscoop', 'newscan\-online', 'nhse', 'northstar',
    'nzexplorer', 'objectssearch', 'occam', 'octopus', 'openfind', 'orb_search',
    'packrat', 'pageboy', 'parasite', 'patric', 'pegasus', 'perignator',
    'perlcrawler', 'phantom', 'phpdig', 'piltdownman', 'pimptrain', 'pioneer',
    'pitkow', 'pjspider', 'plumtreewebaccessor', 'poppi', 'portalb', 'psbot',
    'python', 'raven', 'rbse', 'resumerobot', 'rhcs', 'road_runner', 'robbie',
    'robi', 'robocrawl', 'robofox', 'robozilla', 'roverbot', 'rules',
    'safetynetrobot', 'search\-info', 'search_au', 'searchprocess', 'senrigan',
    'sgscout', 'shaggy', 'shaihulud', 'sift', 'simbot', 'site\-valet', 'sitetech',
    'skymob', 'slcrawler', 'smartspider', 'snooper', 'solbot', 'speedy',
    'spider[_+ ]monkey', 'spiderbot', 'spiderline', 'spiderman', 'spiderview',
    'spry', 'sqworm', 'ssearcher', 'suke', 'sunrise', 'suntek', 'sven', 'tach_bw',
    'tagyu_agent', 'tailrank', 'tarantula', 'tarspider', 'techbot', 'templeton',
    'titan', 'titin', 'tkwww', 'tlspider', 'ucsd', 'udmsearch',
    'universalfeedparser', 'urlck', 'valkyrie', 'verticrawl', 'victoria',
    'visionsearch', 'voidbot', 'vwbot', 'w3index', 'w3m2', 'wallpaper', 'wanderer',
    # NOTE(review): 'wapspIRLider' looks like a corrupted spelling; it is kept
    # unchanged because the ID doubles as a RobotsHashIDLib key - confirm
    # against the hash before correcting it.
    'wapspIRLider',
    'webbandit', 'webcatcher', 'webcopy', 'webfetcher', 'webfoot', 'webinator',
    'weblinker', 'webmirror', 'webmoose', 'webquest', 'webreader', 'webreaper',
    'websnarf', 'webvac',
    'webwalker', # Must be before webwalk
    'webwalk', 'webwatch', 'whatuseek', 'whowhere', 'wired\-digital', 'wmir',
    'wolp', 'wombat', 'wordpress', 'worm', 'woozweb', 'wwwc', 'wz101', 'xget',
    # Other robots reported by users
    '1\-more_scanner', '360spider', 'a6-indexer', 'accoona\-ai\-agent',
    'activebookmark', 'adamm_bot', 'adsbot-google', 'almaden', 'aipbot',
    'aleadsoftbot', 'alpha_search_agent', 'allrati', 'aport', 'archive\.org_bot',
    'argus', # Must be before nutch
    'arianna\.libero\.it', 'aspseek', 'asterias', 'awbot', 'backlinktest\.com',
    'baiduspider', 'becomebot', 'bender', 'betabot', 'biglotron', 'bittorrent_bot',
    'biz360[_+ ]spider', 'blogbridge[_+ ]service', 'bloglines', 'blogpulse',
    'blogsearch', 'blogshares', 'blogslive', 'blogssay',
    'bncf\.firenze\.sbn\.it\/raccolta\.txt', 'bobby', 'boitho\.com\-dc',
    'bookmark\-manager', 'boris', 'bubing', 'bumblebee',
    'candlelight[_+ ]favorites[_+ ]inspector', 'careerbot', 'cbn00glebot',
    'cerberian_drtrs', 'cfnetwork', 'cipinetbot', 'checkweb_link_validator',
    'commons\-httpclient', 'computer_and_automation_research_institute_crawler',
    'converamultimediacrawler', 'converacrawler', 'copubbot', 'cscrawler',
    'cse_html_validator_lite_online', 'cuasarbot', 'cursor', 'custo',
    'datafountains\/dmoz_downloader', 'dataprovider\.com', 'daumoa', 'daviesbot',
    'daypopbot', 'deepindex', 'dipsie\.bot', 'dnsgroup', 'domainchecker',
    'domainsdb\.net', 'dulance', 'dumbot', 'dumm\.de\-bot', 'earthcom\.info',
    'easydl', 'eccp', 'edgeio\-retriever', 'ets_v', 'exactseek',
    'extreme[_+ ]picture[_+ ]finder', 'eventax', 'everbeecrawler',
    'everest\-vulcan', 'ezresult', 'enteprise', 'facebook',
    'fast_enterprise_crawler.*crawleradmin\.t\-info@telekom\.de',
    'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de',
    'matrix_s\.p\.a\._\-_fast_enterprise_crawler', # must come before fast enterprise crawler
    'fast_enterprise_crawler', 'fast\-search\-engine', 'favicon', 'favorg',
    'favorites_sweeper', 'feedburner', 'feedfetcher\-google', 'feedflow',
    'feedster', 'feedsky', 'feedvalidator', 'filmkamerabot', 'filterdb\.iss\.net',
    'findlinks', 'findexa_crawler', 'firmilybot', 'foaf-search\.net',
    'fooky\.com\/ScorpionBot', 'g2crawler', 'gaisbot', 'geniebot', 'gigabot',
    'girafabot', 'global_fetch', 'gnodspider', 'goforit\.com', 'goforitbot',
    'gonzo', 'grapeshot', 'grub', 'gpu_p2p_crawler', 'henrythemiragorobot',
    'heritrix', 'holmes', 'hoowwwer', 'hpprint', 'htmlparser',
    'html[_+ ]link[_+ ]validator', 'httrack', 'hundesuche\.com\-bot', 'i-bot',
    'ichiro', 'iltrovatore\-setaccio', 'infobot', 'infociousbot', 'infohelfer',
    'infomine', 'insurancobot', 'integromedb\.org', 'internet[_+ ]ninja',
    'internetarchive', 'internetseer', 'internetsupervision', 'ips\-agent',
    'irlbot', 'isearch2006', 'istellabot', 'iupui_research_bot',
    'jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility', 'justview',
    'kalambot', 'kamano\.de_newsfeedverzeichnis', 'kazoombot', 'kevin',
    'keyoshid', # Must come before Y!J
    'kinjabot', 'kinja\-imagebot', 'knowitall', 'knowledge\.com', 'kouaa_krawler',
    'krugle', 'ksibot', 'kurzor', 'lanshanbot', 'letscrawl\.com', 'libcrawl',
    'linkbot', 'linkdex\.com', 'link_valet_online',
    'metager\-linkchecker', # Must be before linkchecker
    'linkchecker', 'livejournal\.com', 'lmspider', 'ltbot', 'lwp\-request',
    'lwp\-trivial', 'mail\.ru', 'mapoftheinternet\.com', 'mediapartners\-google',
    'megite', 'metaspinner', 'miadev', 'microsoft bits',
    'microsoft.*discovery', # = 'microsoft (?:office (?:protocol|existence)|data access internet publishing provider protocol) discovery',
    'microsoft[_+ ]url[_+ ]control', 'mini\-reptile', 'minirank',
    'missigua_locator', 'misterbot', 'miva', 'mizzu_labs', 'mj12bot', 'mojeekbot',
    'msiecrawler', 'ms_search_4\.0_robot', 'msrabot', 'msrbot',
    'mt::telegraph::agent', 'mydoyouhike', 'nagios', 'nasa_search',
    'netestate ne crawler', 'netluchs', 'netsprint', 'newsgatoronline', 'nicebot',
    'nimblecrawler', 'noxtrumbot', 'npbot', 'nutchcvs', 'nutchosu\-vlib',
    'nutch', # Must come after other nutch versions
    'ocelli', 'octora_beta_bot', 'omniexplorer[_+ ]bot', 'onet\.pl[_+ ]sa',
    'onfolio', 'opentaggerbot', 'oracle_ultra_search', 'orbiter', 'yodaobot',
    'qihoobot', 'passwordmaker\.org', 'pear_http_request_class', 'peerbot',
    'perman', 'php[_+ ]version[_+ ]tracker', 'pictureofinternet', 'ping\.blo\.gs',
    'plinki', 'pogodak', 'pompos', 'popdexter', 'port_huron_labs', 'postfavorites',
    'projectwf\-java\-test\-crawler', 'proodlebot', 'pyquery', 'rambler',
    'redalert', 'rojo', 'rssimagesbot', 'ruffle', 'rufusbot', 'sandcrawler',
    'sbider', 'schizozilla', 'scumbot', 'searchguild[_+ ]dmoz[_+ ]experiment',
    'searchmetricsbot', 'seekbot', 'semrushbot', 'sensis_web_crawler',
    'seokicks\.de', 'seznambot', 'shim\-crawler', 'shoutcast',
    'siteexplorer\.info', 'slysearch', 'snap\.com_beta_crawler', 'sohu\-search',
    'sohu', # "sohu agent"
    'snappy', 'spbot', 'sphere_scout', 'spiderlytics', 'spip', 'sproose_crawler',
    'ssearch_bot', 'steeler', 'steroid__download', 'suchfin\-bot', 'superbot',
    'surveybot', 'susie', 'syndic8', 'syndicapi', 'synoobot',
    'tcl_http_client_package', 'technoratibot', 'teragramcrawlersurf',
    'test_crawler', 'testbot', 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e', 'topicblogs',
    'turnitinbot',
    'turtlescanner', # Must be before turtle
    'turtle', 'tutorgigbot', 'twiceler', 'ubicrawler', 'ultraseek',
    'unchaos_bot_hybrid_web_search_engine', 'unido\-bot', 'unisterbot', 'updated',
    'ustc\-semantic\-group', 'vagabondo\-wap', 'vagabondo', 'vermut',
    'versus_crawler_from_eda\.baykan@epfl\.ch', 'vespa_crawler', 'vortex',
    'vse\/', 'w3c\-checklink', 'w3c[_+ ]css[_+ ]validator[_+ ]jfouffa',
    'w3c_validator', 'watchmouse', 'wavefire', 'waybackarchive\.org',
    'webclipping\.com', 'webcompass', 'webcrawl\.net', 'web_downloader', 'webdup',
    'webfilter', 'webindexer', 'webminer', 'website[_+ ]monitoring[_+ ]bot',
    'webvulncrawl', 'wells_search', 'wesee:search', 'wonderer', 'wume_crawler',
    'wwweasel', 'xenu\'s_link_sleuth', 'xenu_link_sleuth', 'xirq',
    'y!j', # Must come after keyoshid Y!J
    'yacy', 'yahoo\-blogs', 'yahoo\-verticalcrawler', 'yahoofeedseeker',
    'yahooseeker\-testing', 'yahooseeker', 'yahoo\-mmcrawler', 'yahoo!_mindset',
    'yandex', 'flexum', 'yanga', 'yet-another-spider', 'yooglifetchagent',
    'z\-add_link_checker', 'zealbot', 'zhuaxia', 'zspider', 'zeus',
    'ng\/1\.', # put at end to avoid false positive
    'ng\/2\.', # put at end to avoid false positive
    'exabot', # put at end to avoid false positive
    # Additional bots found by Sussex.
    '^[1-3]$', # Hiding bots. Doesn't appear to be a valid user agent.
    'alltop', 'applesyndication', 'asynchttpclient', 'bingbot', 'blogged_crawl',
    'bloglovin', 'butterfly', 'buzztracker', 'carpathia', 'catbot', 'chattertrap',
    'check_http', #(nagios) a monitoring tool
    'coldfusion', 'covario', 'daylifefeedfetcher', 'discobot', 'dlvr\.it',
    'dreamwidth', 'drupal', 'ezoom', 'feedmyinbox', 'feedroll\.com', 'feedzira',
    'fever\/', 'freenews', 'geohasher', 'hanrss', 'inagist', 'jacobin club',
    'jakarta', 'js\-kit', 'largesmall crawler', 'linkedinbot', 'longurl',
    'metauri', 'microsoft\-webdav\-miniredir', 'movabletype',
    # These appear to be bots trying to hide. All of the usual architecture data is missing.
    '^mozilla\/3\.0 \(compatible$', '^mozilla\/4\.0$',
    '^mozilla\/4\.0 \(compatible;\)$', '^mozilla\/5\.0$',
    '^mozilla\/5\.0 \(compatible;$', '^mozilla\/5\.0 \(en\-us\)$',
    '^mozilla\/5\.0 firefox\/3\.0\.5$', '^msie',
    # End of hiding bots.
    'netnewswire', ' netseer ', 'netvibes', 'newrelicpinger', 'newsfox',
    'nextgensearchbot', 'ning', 'pingdom', 'pita', 'postpost', 'postrank',
    'printfulbot', 'protopage', 'proximic', 'quipply', 'r6\_', 'ratingburner',
    'regator', 'rome client', 'rpt\-httpclient', 'rssgraffiti', 'sage\+\+',
    'scoutjet', 'simplepie', 'sitebot', 'summify\.com', 'superfeedr', 'synthesio',
    'teoma', 'topblogsinfo', 'topix\.net', 'trapit', 'trileet', 'tweetedtimes',
    'twisted pagegetter', 'twitterbot', 'twitterfeed', 'unwindfetchor', 'wazzup',
    'windows\-rss\-platform', 'wiumi', 'xydo', 'yahoo! slurp', 'yahoo pipes',
    'yahoo\-newscrawler', 'yahoocachesystem', 'yahooexternalcache',
    'yahoo! searchmonkey', 'yahooysmcm', 'yammer',
    # 'yandexbot', #already covered by 'yandex'
    'yeti', 'yie8', 'youdao', 'yourls', 'zemanta', 'zend_http_client', 'zumbot',
    # Other id that are 99% of robots
    'wget', 'libwww',
    '^java\/[0-9]' # put at end to avoid false positive
);

# listgen: generic patterns, listed after the named robots above.
@RobotsSearchIDOrder_listgen = (
    # Generic robot
    'robot', 'checker', 'crawl', 'discovery', 'hunter', 'scanner', 'spider',
    'sucker', 'bot[\s_+:,\.\;\/\\\-]', '[\s_+:,\.\;\/\\\-]bot', 'curl', 'php',
    'ruby\/', 'no_user_agent'
);

# RobotsHashIDLib
# List of robots names ('robot id','robot clear text')
#-------------------------------------------------------
%RobotsHashIDLib = (
# Common robots (In robot file)
'appie','Walhello appie',
'architext','ArchitextSpider',
'bingpreview','Bing Preview bot',
'bjaaland','Bjaaland',
'ferret','Wild Ferret Web Hopper #1, #2, #3',
'contentmatch','Yahoo!China ContentMatch Crawler',
'googlebot\-image','Googlebot-Image',
'googlebot','Googlebot',
'google\-sitemaps', 'Google Sitemaps',
'grabber', 'Grabber (SDSC)',
'google[_+ ]web[_+ ]preview', 'Google Web Preview',
'gulliver','Northern Light Gulliver',
'virus[_+ ]detector','virus_detector',
'harvest','Harvest',
'htdig','ht://Dig',
'jeeves','Ask',
'linkwalker','LinkWalker',
'lilina','Lilina',
'lycos[_+ ]','Lycos',
'moget','moget',
'muscatferret','Muscat Ferret',
'myweb','Internet Shinchakubin', 'nomad','Nomad', 'scooter','Scooter', 'slurp','Yahoo Slurp', '^voyager\/','Voyager', 'weblayers','Weblayers', # Common robots (Not in robot file) 'antibot','Antibot', 'bruinbot','The web archive', 'digout4u','Digout4u', 'echo!','EchO!', 'fast\-webcrawler','Fast-Webcrawler', 'ia_archiver\-web\.archive\.org','The web archive (IA Archiver)', 'ia_archiver','Alexa (IA Archiver)', 'jennybot','JennyBot', 'mercator','Mercator', 'msnbot\-media','MSNBot-media', 'msnbot','MSNBot', 'netcraft','Netcraft', 'petersnews','Petersnews', 'unlost_web_crawler','Unlost Web Crawler', 'voila','Voila', 'webbase', 'WebBase', 'zyborg','ZyBorg', 'wisenutbot','WISENutbot', 'webcollage','WebCollage', 'cfetch','Cfetch', # Less common robots (In robot file) '[^a]fish','Fish search', 'abcdatos','ABCdatos BotLink', 'abonti\.com','Abonti WebSearch', 'acme\.spider','Acme.Spider', 'ahoythehomepagefinder','Ahoy! The Homepage Finder', 'ahrefsbot', 'AhrefsBot', 'alkaline','Alkaline', 'anthill','Anthill', 'arachnophilia','Arachnophilia', 'arale','Arale', 'araneo','Araneo', 'aretha','Aretha', 'ariadne','ARIADNE', 'powermarks','Powermarks', # must come before Arks; seen used by referrer spam 'arks','arks', 'aspider','ASpider (Associative Spider)', 'atn\.txt','ATN Worldwide', 'atomz','Atomz.com Search Robot', 'auresys','AURESYS', 'backrub','BackRub', 'bbot','BBot', 'bigbrother','Big Brother', 'blackwidow','BlackWidow', 'blindekuh','Die Blinde Kuh', 'bloodhound','Bloodhound', 'borg\-bot','Borg-Bot', 'brightnet','bright.net caching robot', 'bspider','BSpider', 'cactvschemistryspider','CACTVS Chemistry Spider', 'calif[^r]','Calif', 'cassandra','Cassandra', 'cgireader','Digimarc Marcspider/CGI', 'checkbot','Checkbot', 'christcrawler','ChristCrawler.com', 'churl','churl', 'cienciaficcion','cIeNcIaFiCcIoN.nEt', 'collective','Collective', 'combine','Combine System', 'conceptbot','Conceptbot', 'coolbot','CoolBot', 'core','Web Core / Roots', 'cosmos','XYLEME Robot', 
'cruiser','Internet Cruiser Robot', 'cusco','Cusco', 'cyberspyder','CyberSpyder Link Test', 'desertrealm','Desert Realm Spider', 'deweb','DeWeb(c) Katalog/Index', 'dienstspider','DienstSpider', 'digger','Digger', 'diibot','Digital Integrity Robot', 'direct_hit','Direct Hit Grabber', 'dnabot','DNAbot', 'download_express','DownLoad Express', 'dragonbot','DragonBot', 'dwcp','DWCP (Dridus\' Web Cataloging Project)', 'e\-collector','e-collector', 'ebiness','EbiNess', 'elfinbot','ELFINBOT', 'emacs','Emacs-w3 Search Engine', 'emcspider','ananzi', 'esther','Esther', 'evliyacelebi','Evliya Celebi', 'fastcrawler','FastCrawler', 'feedcrawl','FeedCrawl by feed@aobo.com', 'fdse','Fluid Dynamics Search Engine robot', 'felix','Felix IDE', 'fetchrover','FetchRover', 'fido','fido', 'finnish','Finnish', 'fireball','KIT-Fireball', 'fouineur','Fouineur', 'francoroute','Robot Francoroute', 'freecrawl','Freecrawl', 'funnelweb','FunnelWeb', 'gama','gammaSpider, FocusedCrawler', 'gazz','gazz', 'gcreep','GCreep', 'getbot','GetBot', 'geturl','GetURL', 'golem','Golem', 'gougou','GouGou', 'grapnel','Grapnel/0.01 Experiment', 'griffon','Griffon', 'gromit','Gromit', 'gulperbot','Gulper Bot', 'hambot','HamBot', 'havindex','havIndex', 'hometown','Hometown Spider Pro', 'htmlgobble','HTMLgobble', 'hyperdecontextualizer','Hyper-Decontextualizer', 'iajabot','iajaBot', 'iaskspider','Sina Iask Spider', 'hl_ftien_spider','Hylanda', 'sogou','Sogou Spider', 'icjobs\.de', 'iCjobs Spider', #20130805 The user agent string of the icjobs-spider contained the #identifying string only when it accessed the robots.txt file. #When it accessed the actual content it did not identify itself as #a spider. Thus traffic of this spider was counted as user traffic. #The behavious seems to have changed now - the spider identifies itself #when it accesses content pages. 
'iconoclast','Popular Iconoclast', 'ilse','Ingrid', 'imagelock','Imagelock', 'incywincy','IncyWincy', 'informant','Informant', 'infoseek','InfoSeek Robot 1.0', 'infoseeksidewinder','Infoseek Sidewinder', 'infospider','InfoSpiders', 'inspectorwww','Inspector Web', 'intelliagent','IntelliAgent', 'ips\-agent', 'ips-agent Verisign(?) - no reliable information found.', 'irobot','I, Robot', 'iron33','Iron33', 'israelisearch','Israeli-search', 'javabee','JavaBee', 'jbot','JBot Java Web Robot', 'jcrawler','JCrawler', 'jobo','JoBo Java Web Robot', 'jobot','Jobot', 'joebot','JoeBot', 'jubii','The Jubii Indexing Robot', 'jumpstation','JumpStation', 'kapsi','image.kapsi.net', 'katipo','Katipo', 'kilroy','Kilroy', 'ko[_+ ]yappo[_+ ]robot','KO_Yappo_Robot', 'kummhttp','KummHttp', 'labelgrabber\.txt','LabelGrabber', 'larbin','larbin', 'legs','legs', 'linkidator','Link Validator', 'linkscan','LinkScan', 'lockon','Lockon', 'logo_gif','logo.gif Crawler', 'macworm','Mac WWWWorm', 'lmspider','lmspider', 'lwp\-request','lwp-request', 'lwp\-trivial','lwp-trivial', 'magpie','MagpieRSS', 'marvin','marvin/infoseek', 'mattie','Mattie', 'mediafox','MediaFox', 'merzscope','MerzScope', 'meshexplorer','NEC-MeshExplorer', 'mindcrawler','MindCrawler', 'mnogosearch','mnoGoSearch search engine software', 'momspider','MOMspider', 'monster','Monster', 'motor','Motor', 'muncher','Muncher', 'mwdsearch','Mwd.Search', 'ndspider','NDSpider', 'nederland\.zoek','Nederland.zoek', 'netcarta','NetCarta WebMap Engine', 'netmechanic','NetMechanic', 'netscoop','NetScoop', 'newscan\-online','newscan-online', 'nhse','NHSE Web Forager', 'northstar','The NorthStar Robot', 'nzexplorer','nzexplorer', 'objectssearch','ObjectsSearch', 'occam','Occam', 'octopus','HKU WWW Octopus', 'openfind','Openfind data gatherer', 'orb_search','Orb Search', 'packrat','Pack Rat', 'pageboy','PageBoy', 'parasite','ParaSite', 'patric','Patric', 'pegasus','pegasus', 'perignator','The Peregrinator', 'perlcrawler','PerlCrawler 1.0', 
'phantom','Phantom', 'phpdig','PhpDig', 'piltdownman','PiltdownMan', 'pimptrain','Pimptrain.com\'s robot', 'pioneer','Pioneer', 'pitkow','html_analyzer', 'pjspider','Portal Juice Spider', 'plumtreewebaccessor','PlumtreeWebAccessor', 'poppi','Poppi', 'portalb','PortalB Spider', 'psbot','psbot', 'python','Python-urllib', 'raven','Raven Search', 'rbse','RBSE Spider', 'resumerobot','Resume Robot', 'rhcs','RoadHouse Crawling System', 'road_runner','Road Runner: The ImageScape Robot', 'robbie','Robbie the Robot', 'robi','ComputingSite Robi/1.0', 'robocrawl','RoboCrawl Spider', 'robofox','RoboFox', 'robozilla','Robozilla', 'roverbot','Roverbot', 'rules','RuLeS', 'safetynetrobot','SafetyNet Robot', 'search\-info','Sleek', 'search_au','Search.Aus-AU.COM', 'searchprocess','SearchProcess', 'senrigan','Senrigan', 'sgscout','SG-Scout', 'shaggy','ShagSeeker', 'shaihulud','Shai\'Hulud', 'sift','Sift', 'simbot','Simmany Robot Ver1.0', 'site\-valet','Site Valet', 'sitetech','SiteTech-Rover', 'skymob','Skymob.com', 'slcrawler','SLCrawler', 'smartspider','Smart Spider', 'snooper','Snooper', 'solbot','Solbot', 'speedy','Speedy Spider', 'spider[_+ ]monkey','Spider monkey', 'spiderbot','SpiderBot', 'spiderline','Spiderline Crawler', 'spiderlytics', 'Spiderlytics: No homepage, e-mail only: spider (at) spiderlytics.com', 'spiderman','Spiderman', 'spiderview','SpiderView(tm)', 'spry','Spry Wizard Robot', 'ssearcher','Site Searcher', 'sqworm','Sqworm', 'suke','Suke', 'sunrise','Sunrise', 'suntek','suntek search engine', 'sven','Sven', 'tach_bw','TACH Black Widow', 'tagyu_agent','Tagyu Agent', 'tarantula','Tarantula', 'tarspider','tarspider', 'tailrank','TailRank', 'techbot','TechBOT', 'templeton','Templeton', 'titan','TITAN', 'titin','TitIn', 'tkwww','The TkWWW Robot', 'tlspider','TLSpider', 'ucsd','UCSD Crawl', 'udmsearch','UdmSearch', 'universalfeedparser','UniversalFeedParser', 'urlck','URL Check', 'valkyrie','Valkyrie', 'verticrawl','Verticrawl', 'victoria','Victoria', 
'visionsearch','vision-search', 'voidbot','void-bot', 'vwbot','VWbot', 'w3index','The NWI Robot', 'w3m2','W3M2', 'wallpaper','WallPaper (alias crawlpaper)', 'wanderer','the World Wide Web Wanderer', 'wapspider','w@pSpider by wap4.com', 'webbandit','WebBandit Web Spider', 'webcatcher','WebCatcher', 'webcopy','WebCopy', 'webfetcher','webfetcher', 'webfoot','The Webfoot Robot', 'webinator','Webinator', 'weblinker','WebLinker', 'webmirror','WebMirror', 'webmoose','The Web Moose', 'webquest','WebQuest', 'webreader','Digimarc MarcSpider', 'webreaper','WebReaper', 'websnarf','Websnarf', 'webspider','WebSpider', 'webvac','WebVac', 'webwalk','webwalk', 'webwalker','WebWalker', 'webwatch','WebWatch', 'whatuseek','whatUseek Winona', 'whowhere','WhoWhere Robot', 'wired\-digital','Wired Digital', 'wmir','w3mir', 'wolp','WebStolperer', 'wombat','The Web Wombat', 'wordpress','WordPress', 'worm','The World Wide Web Worm', 'woozweb','Woozweb Monitoring', 'wwwc','WWWC Ver 0.2.5', 'wz101','WebZinger', 'xget','XGET', # Other robots reported by users '1\-more_scanner','1-More Scanner', '360spider','360spider', 'a6-indexer', 'A6-Indexer', 'accoona\-ai\-agent','Accoona-AI-Agent', 'activebookmark','ActiveBookmark', 'adamm_bot','AdamM Bot', 'adsbot-google', 'AdsBot-Google', 'almaden','IBM Almaden Research Center WebFountain™', 'aipbot','aipbot', 'aleadsoftbot','ALeadSoftbot', 'alpha_search_agent','Alpha Search Agent', 'allrati','Allrati', 'aport', 'Aport', 'archive\.org_bot','archive.org bot', 'argus','Argus', 'arianna\.libero\.it','arianna.libero.it', 'aspseek','ASPseek', 'asterias', 'Asterias', 'awbot', 'AWBot', 'backlinktest\.com', 'BacklinkCrawler', 'baiduspider','BaiDuSpider', 'becomebot', 'BecomeBot', 'bender','bender focused_crawler', 'betabot','BetaBot', 'biglotron','Biglotron', 'bittorrent_bot','BitTorrent Bot', 'biz360[_+ ]spider','Biz360 spider', 'blogbridge[_+ ]service','BlogBridge Service', 'bloglines','Bloglines', 'blogpulse','BlogPulse ISSpider intelliseek.com', 
'blogsearch','BlogSearch', 'blogshares','Blogshares Spiders', 'blogslive','Blogslive', 'blogssay','BlogsSay :: RSS Search Crawler', 'bncf\.firenze\.sbn\.it\/raccolta\.txt','Biblioteca Nazionale Centrale di Firenze', 'bobby', 'Bobby', 'boitho\.com\-dc','boitho.com-dc', 'bookmark\-manager','Bookmark-Manager', 'boris', 'Boris', 'bubing', 'BUbiNG', 'bumblebee', 'Bumblebee (relevare.com)', 'candlelight[_+ ]favorites[_+ ]inspector','Candlelight_Favorites_Inspector', 'careerbot', 'CareerBot', 'cbn00glebot','cbn00glebot', 'cerberian_drtrs','Cerberian Drtrs', 'cfnetwork','CFNetwork', 'cipinetbot','CipinetBot', 'checkweb_link_validator','CheckWeb link validator', 'commons\-httpclient','Jakarta commons-httpclient', 'computer_and_automation_research_institute_crawler','Computer and Automation Research Institute Crawler', 'converamultimediacrawler','ConveraMultiMediaCrawler', 'converacrawler','ConveraCrawler', 'copubbot', 'CoPubbot', 'cscrawler','CsCrawler', 'cse_html_validator_lite_online','CSE HTML Validator Lite Online','cuasarbot','Cuasarbot', 'cursor','Cursor', 'custo','Custo', 'datafountains\/dmoz_downloader','DataFountains/DMOZ Downloader', 'dataprovider\.com', 'Dataprovider Site Explorer', 'daumoa', 'Daum', 'daviesbot', 'DaviesBot', 'daypopbot', 'DayPop', 'deepindex','Deepindex', 'dipsie\.bot','Dipsie', 'dnsgroup','DNSGroup', 'domainchecker','DomainChecker', 'domainsdb\.net','DomainsDB.net', 'dulance','Dulance', 'dumbot','Dumbot', 'dumm\.de\-bot','dumm.de-Bot', 'earthcom\.info','EARTHCOM.info', 'easydl','EasyDL', 'eccp', 'Eniro Sverige, email: search (at) eniro.com', 'edgeio\-retriever','edgeio-retriever', 'ets_v','ETS Enterprise Translation Server', 'exactseek','ExactSeek Crawler', 'extreme[_+ ]picture[_+ ]finder','Extreme_Picture_Finder', 'eventax','eventax', 'everbeecrawler','EverbeeCrawler', 'everest\-vulcan','Everest-Vulcan', 'ezresult', 'Ezresult', 'enteprise','Fast Enteprise Crawler', 'facebook','FaceBook bot', 'fast\-search\-engine','Fast-Search-Engine (not 
fastsearch.com)', 'fast_enterprise_crawler','FAST Enterprise Crawler', 'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * crawleradmin.t-info@telekom.de', 'matrix_s\.p\.a\._\-_fast_enterprise_crawler','Matrix S.p.A. - FAST Enterprise Crawler', 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de', 'favicon','FavIconizer', 'favorg','FavOrg', 'favorites_sweeper','Favorites Sweeper', 'feedburner', 'Feedburner', 'feedfetcher\-google','Feedfetcher-Google', 'feedflow','FeedFlow', 'feedster','Feedster', 'feedsky','FeedSky', 'feedvalidator','FeedValidator', 'filmkamerabot','FilmkameraBot', 'filterdb\.iss\.net', 'oBot', 'findexa_crawler','Findexa Crawler', 'firmilybot', 'Firmily Bot Home page (Website was hacked on Oct. 19, 2013)', 'findlinks','Findlinks', 'foaf-search\.net', 'Friend of a friend (FOAF) search engine', 'fooky\.com\/ScorpionBot','Fooky.com/ScorpionBot/ScoutOut', 'g2crawler','G2Crawler', 'gaisbot','Gaisbot', 'geniebot','Geniebot', 'gigabot','GigaBot', 'girafabot','Girafabot', 'global_fetch','Global Fetch', 'gnodspider','GNOD Spider', 'goforit\.com','GoForIt.com', 'goforitbot','GOFORITBOT', 'gonzo','suchen.de', 'gpu_p2p_crawler','GPU p2p crawler', 'grapeshot', 'Grapeshot Crawler', 'grub','Grub.org', 'henrythemiragorobot', 'Mirago', 'heritrix','Heritrix', 'holmes', 'Holmes', 'hoowwwer','HooWWWer', 'hpprint','HPPrint', 'htmlparser','HTMLParser', 'html[_+ ]link[_+ ]validator','Html_Link_Validator', 'httrack','HTTrack off-line browser', 'hundesuche\.com\-bot','Hundesuche.com-Bot', 'i-bot','i-bot', 'ichiro','ichiro', 'iltrovatore\-setaccio','IlTrovatore-Setaccio', 'infobot','InfoBot', 'infociousbot','InfociousBot', 'infohelfer','Infohelfer', 'infomine','INFOMINE VLCrawler', 'insurancobot','InsurancoBot', 'integromedb\.org','IntegromeDB', 'internet[_+ ]ninja','Internet_Ninja ', 'internetarchive','InternetArchive', 
'internetseer', 'InternetSeer', 'internetsupervision','InternetSupervision', 'irlbot','IRLbot', 'isearch2006','isearch2006', 'istellabot', 'IstellaBot', 'iupui_research_bot','IUPUI_Research_Bot', 'jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility','JRTwine_Software_Check_Favorites_Utility', 'justview', 'JustView', 'kalambot','KalamBot', 'kamano\.de_newsfeedverzeichnis','kamano.de NewsFeedVerzeichnis', 'kazoombot','KazoomBot', 'kevin','Kevin', 'keyoshid','Yahoo! Japan keyoshid robot study', 'kinjabot', 'Kinjabot', 'kinja\-imagebot', 'Kinja Imagebot', 'knowitall','KnowItAll', 'knowledge\.com','Knowledge.com', 'kouaa_krawler','Kouaa Krawler', 'krugle','Krugle', 'ksibot','ksibot', 'kurzor','Kurzor', 'lanshanbot','lanshanbot', 'letscrawl\.com','LetsCrawl.com', 'libcrawl','Crawl libcrawl', 'link_valet_online','Link Valet Online', 'linkbot','LinkBot', 'linkdex\.com', 'Linkdex', 'linkchecker','LinkChecker', 'livejournal\.com', 'LiveJournal.com', 'ltbot', 'Language Tools Bot (ltbot)', 'magpierss', 'MagpieRSS', 'mail\.ru', 'Mail.ru bot', 'mapoftheinternet\.com','MapoftheInternet.com', 'mediapartners\-google','Google AdSense', 'megite','Megite', 'metager\-linkchecker','MetaGer LinkChecker', 'metaspinner','Metaspinner', 'miadev', 'MiaDev spider', 'microsoft bits', 'Microsoft Background Intelligent Transfer Service (BITS)?', 'microsoft.*discovery', 'Microsoft Office Protocol Discovery/Microsoft Office Existence Discovery', 'microsoft[_+ ]url[_+ ]control','Microsoft URL Control', 'minirank','miniRank', 'mini\-reptile','Mini-reptile', 'missigua_locator','Missigua_Locator', 'misterbot','Misterbot', 'miva','Miva', 'mizzu_labs','Mizzu Labs', 'mj12bot','MJ12bot', 'mojeekbot','MojeekBot', 'msiecrawler','MSIECrawler', 'ms_search_4\.0_robot','MS SharePoint Portal Server - MS Search 4.0 Robot', 'msrabot','msrabot', 'msrbot','MSRBOT', 'mt::telegraph::agent','MT::Telegraph::Agent', 'mydoyouhike','Mydoyouhike', 'nagios','Nagios', 'nasa_search','NASA Search', 'netestate ne 
crawler','Website-Datenbank', 'netluchs','Netluchs', 'netsprint','NetSprint', 'newsgatoronline', 'NewsGator Online', 'nicebot','nicebot', 'nimblecrawler','NimbleCrawler', 'noxtrumbot','noxtrumbot', 'npbot','NPBot', 'nutchcvs','NutchCVS', 'nutchosu\-vlib','NutchOSU-VLIB', 'nutch','Nutch', 'ocelli','Ocelli', 'octora_beta_bot','Octora Beta Bot', 'omniexplorer[_+ ]bot','OmniExplorer Bot', 'onet\.pl[_+ ]sa','Onet.pl_SA', 'onfolio','Onfolio', 'opentaggerbot','OpenTaggerBot', 'openwebspider','OpenWebSpider', 'oracle_ultra_search','Oracle Ultra Search', 'orbiter','Orbiter', 'yodaobot','OutfoxBot/YodaoBot', 'qihoobot','QihooBot', 'passwordmaker\.org','passwordmaker.org', 'pear_http_request_class','PEAR HTTP Request class', 'peerbot','PEERbot', 'perman', 'Perman surfer', 'php[_+ ]version[_+ ]tracker','PHP version tracker', 'pictureofinternet','PictureOfInternet', 'ping\.blo\.gs','ping.blo.gs', 'plinki','plinki', 'pluckfeedcrawler','PluckFeedCrawler', 'pogodak','Pogodak.com', 'pompos','Pompos', 'popdexter','Popdexter', 'port_huron_labs','Port Huron Labs', 'postfavorites','PostFavorites', 'projectwf\-java\-test\-crawler','ProjectWF-java-test-crawler', 'proodlebot','proodleBot', 'pyquery','PyQuery', 'rambler','StackRambler', 'redalert','Red Alert', 'relevantnoise\.com', 'Relevant Noise', 'rojo','RoJo aggregator', 'rssimagesbot','rssImagesBot', 'ruffle','ruffle SemanticWeb crawler', 'rufusbot','RufusBot Rufus Web Miner', 'sandcrawler','SandCrawler (Microsoft)', 'sbider','SBIder', 'schizozilla','Schizozilla', 'scumbot','Scumbot', 'searchguild[_+ ]dmoz[_+ ]experiment','SearchGuild_DMOZ_Experiment', 'searchmetricsbot','SearchmetricsBot', 'seekbot','Seekbot', 'semrushbot', 'SemrushBot', 'sensis_web_crawler','Sensis Web Crawler', 'seokicks\.de', 'SEOkicks Webcrawler', 'seznambot','SeznamBot', 'shim\-crawler','Shim-Crawler', 'shoutcast','Shoutcast Directory Service', 'siteexplorer\.info', 'Site Explorer', 'slysearch','SlySearch', 'snap\.com_beta_crawler','snap.com beta crawler', 
'sohu\-search','sohu-search', 'sohu','sohu agent', 'snappy','Snappy', 'spbot', 'SEOprofiler Bot', 'sphere_scout','Sphere Scout', 'spip','SPIP', 'sproose_crawler','sproose crawler', 'ssearch_bot', 'sSearch Crawler', 'steroid__download','STEROID Download', 'steeler','Steeler', 'suchfin\-bot','Suchfin-Bot', 'superbot','SuperBot', 'surveybot','SurveyBot', 'susie','Susie', 'syndic8','Syndic8', 'syndicapi','SyndicAPI', 'synoobot','SynooBot', 'tcl_http_client_package','Tcl http client package', 'technoratibot', 'Technoratibot', 'teragramcrawlersurf','TeragramCrawlerSURF', 'test_crawler','Test Crawler', 'testbot','TestBot', 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e','T-H-U-N-D-E-R-S-T-O-N-E', 'topicblogs', 'topicblogs', 'turnitinbot','Turn It In', 'turtle', 'Turtle', 'turtlescanner', 'Turtle', 'tutorgigbot','TutorGigBot', 'twiceler','twiceler', 'ubicrawler','UbiCrawler', 'ultraseek', 'Ultraseek', 'unchaos_bot_hybrid_web_search_engine','UnChaos Bot Hybrid Web Search Engine', 'unido\-bot','unido-bot', 'unisterbot', 'UnisterBot; E-Mail only: crawler (at) unister.de', 'updated','updated', 'ustc\-semantic\-group','USTC-Semantic-Group', 'vagabondo\-wap','Vagabondo-WAP', 'vagabondo','Vagabondo', 'vermut','Vermut', 'versus_crawler_from_eda\.baykan@epfl\.ch','versus crawler from eda.baykan@epfl.ch', 'vespa_crawler','Vespa Crawler', 'vortex','VORTEX', 'vse\/','VSE', 'w3c\-checklink','W3C Link Checker', 'w3c[_+ ]css[_+ ]validator[_+ ]jfouffa', 'W3C jigsaw CSS Validator', 'w3c_validator','W3C Validator', 'watchmouse', 'WatchMouse Website Monitor', 'wavefire','Wavefire', 'waybackarchive\.org', 'No website, email: spider(at)waybackarchive.org', # 2.12.2013 Project Honeypot reports at least one of the IPs used by waybackarchive with a spiderlytics UA string. # Problably not related to the wayback machine of archive.org. 
'webclipping\.com', 'WebClipping.com', 'webcompass', 'webcompass', 'webcrawl\.net','webcrawl.net', 'web_downloader','Web Downloader', 'webdup','Webdup', 'webfilter','WebFilter', 'webindexer','WebIndexer', 'webminer','WebMiner', 'website[_+ ]monitoring[_+ ]bot','Website_Monitoring_Bot', 'webvulncrawl', 'WebVulnCrawl', 'wells_search','Wells Search', 'wesee:search', 'WeSEE Bot', 'wonderer', 'Web Wombat Redback Spider', 'wume_crawler','wume crawler', 'wwweasel',,'WWWeasel', 'xenu\'s_link_sleuth','Xenu Link Sleuth', 'xenu_link_sleuth','Xenu Link Sleuth', 'xirq','xirq', 'y!j', 'Y!J Yahoo Japan', 'yacy','yacy', 'yahoo\-blogs','Yahoo-Blogs', 'yahoo\-verticalcrawler', 'Yahoo Vertical Crawler', 'yahoofeedseeker', 'Yahoo Feed Seeker', 'yahooseeker\-testing', 'YahooSeeker-Testing', 'yahooseeker', 'YahooSeeker Yahoo! Blog crawler', 'yahoo\-mmcrawler', 'Yahoo-MMCrawler', 'yahoo!_mindset','Yahoo! Mindset', 'yandex', 'Yandex Bot', 'flexum', 'Flexum Search Engine', 'yanga', 'Yanga WorldSearch Bot', 'yet-another-spider','Yet-Another-Spider', 'yooglifetchagent','yoogliFetchAgent', 'z\-add_link_checker','Z-Add Link Checker', 'zealbot','ZealBot', 'zhuaxia','ZhuaXia', 'zspider','zspider', 'zeus','Zeus Webster Pro', 'zumbot','ZumBot', 'ng\/1\.','NG 1.x (Exalead)', # put at end to avoid false positive 'ng\/2\.','NG 2.x (Exalead)', # put at end to avoid false positive 'exabot','Exabot', # put at end to avoid false positive # Other id that are 99% of robots 'wget','WGet tools', 'libwww','Perl tool', '^java\/[0-9]','Java (Often spam bot)', # put at end to avoid false positive # Generic robot 'robot', 'Unknown robot (identified by \'robot\')', 'checker', 'Unknown robot (identified by \'checker\')', 'crawl', 'Unknown robot (identified by \'crawl\')', 'discovery', 'Unknown robot (identified by \'discovery\')', 'hunter', 'Unknown robot (identified by \'hunter\')', 'scanner', 'Unknown robot (identified by \'scanner\')', 'spider', 'Unknown robot (identified by \'spider\')', 'sucker', 'Unknown 
robot (identified by \'sucker\')', 'bot[\s_+:,\.\;\/\\\-]','Unknown robot (identified by \'bot\' followed by a space or one of the following characters _+:,.;/\-)', '[\s_+:,\.\;\/\\\-]bot','Unknown robot (identified by \'bot\' preceded by a space or one of the following characters _+:,.;/\-)', 'curl', 'Common *nix tool for automating web document retireval. Most likely a bot.', 'php', 'A PHP script', 'ruby\/', 'Ruby script', # Additional bots found by Sussex. '^[1-3]$', 'Generic bot identified as "1", "2" or "3"', 'alltop', 'alltop', 'applesyndication', 'applesyndication', 'asynchttpclient', 'asynchttpclient', 'bingbot', 'Bingbot', 'blogged_crawl', 'blogged_crawl', 'bloglovin', 'bloglovin', 'butterfly', 'butterfly', 'buzztracker', 'buzztracker', 'carpathia', 'carpathia', 'catbot', 'catbot', 'chattertrap', 'chattertrap', 'check_http', 'check_http (nagios)', 'coldfusion', 'coldfusion', 'covario', 'covario', 'daylifefeedfetcher', 'daylifefeedfetcher', 'discobot', 'discobot', 'dlvr\.it', 'dlvr.it', 'dreamwidth', 'dreamwidth', 'drupal', 'Drupal Site', 'ezoom', 'ezoom', 'feedmyinbox', 'feedmyinbox', 'feedroll\.com', 'feedroll.com', 'feedzira', 'feedzira', 'fever\/', 'Feed a Fever', 'freenews', 'freenews', 'geohasher', 'geohasher', 'hanrss', 'hanrss', 'inagist', 'inagist', 'jacobin club', 'jacobin club', 'jakarta', 'jakarta', 'js\-kit', 'js-kit', 'largesmall crawler', 'largesmall crawler', 'linkedinbot', 'linkedinbot', 'longurl', 'longurl', 'metauri', 'metauri', 'microsoft\-webdav\-miniredir', 'microsoft-webdav-miniredir', '^motorola$', 'Suspected Bot masquerading as "Motorola"', 'movabletype', 'movabletype', '^mozilla\/3\.0 \(compatible$', 'Suspected bot masqurading as Mozilla', '^mozilla\/4\.0$', 'Suspected bot masqurading as Mozilla', '^mozilla\/4\.0 \(compatible;\)$', 'Suspected bot masqurading as Mozilla', '^mozilla\/5\.0$', 'Suspected bot masqurading as Mozilla', '^mozilla\/5\.0 \(compatible;$', 'Suspected bot masqurading as Mozilla', '^mozilla\/5\.0 \(en\-us\)$', 
'Suspected bot masqurading as Mozilla', '^mozilla\/5\.0 firefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla', '^msie', 'Suspected bot masquerading as M$ IE', 'netnewswire', 'netnewswire', ' netseer ', 'Net Seer', 'netvibes', 'netvibes', 'newrelicpinger', 'newrelicpinger', 'newsfox', 'Fox News', 'nextgensearchbot', 'nextgensearchbot', 'ning', 'ning', 'pingdom', 'pingdom', 'pita', 'pita (pain in the ass?)', 'postpost', 'postpost', 'postrank', 'postrank', 'printfulbot', 'printfulbot', 'protopage', 'protopage', 'proximic', 'Proximic Spider', 'quipply', 'quipply', 'r6\_', 'Radian 6 Crawler', 'ratingburner', 'ratingburner', 'regator', 'regator', 'rome client', 'rome client', 'rpt\-httpclient', 'rpt-httpclient', 'rssgraffiti', 'rssgraffiti', 'sage\+\+', 'sage++', 'scoutjet', 'ScoutJet crawler for Blekko.', 'simplepie', 'simplepie', 'sitebot', 'sitebot', 'summify\.com', 'summify.com', 'superfeedr', 'superfeedr', 'synthesio', 'synthesio', 'teoma', 'teoma', 'topblogsinfo', 'topblogsinfo', 'topix\.net', 'topix.net', 'trapit', 'trapit', 'trileet', 'trileet', 'tweetedtimes', 'The Tweeted Times', 'twisted pagegetter', 'twisted pagegetter', 'twitterbot', 'twitterbot', 'twitterfeed', 'twitterfeed', 'unwindfetchor', 'unwindfetchor', 'wazzup', 'wazzup', 'windows\-rss\-platform', 'windows-rss-platform', 'wiumi', 'wiumi', 'xydo', 'xydo', 'yahoo! slurp', 'Additional Yahoo bots.', 'yahoo pipes', 'Additional Yahoo bots.', 'yahoo\-newscrawler', 'Additional Yahoo bots.', 'yahoocachesystem', 'Additional Yahoo bots.', 'yahooexternalcache', 'Additional Yahoo bots.', 'yahoo! 
searchmonkey', 'Additional Yahoo bots.', 'yahooysmcm', 'Additional Yahoo bots.', 'yammer', 'yammer', #'yandexbot', 'yandexbot', #already covered by 'yandex' 'yeti', 'yeti', 'yie8', 'yie8', 'youdao', 'youdao', 'yourls', 'yourls', 'zemanta', 'zemanta', 'zend_http_client', 'Zend Http Client', 'no_user_agent','Unknown robot (identified by empty user agent string)', # Unknown robots identified by hit on robots.txt 'unknown', 'Unknown robot (identified by hit on \'robots.txt\')' ); # RobotsAffiliateLib # This list try to tell by which Search Engine a robot is used #------------------------------------------------------------- %RobotsAffiliateLib = ( 'bingpreview'=>'Bing', 'fast\-webcrawler'=>'AllTheWeb', 'googlebot'=>'Google', 'google\-sitemap'=>'Google', 'google[_+ ]web[_+ ]preview'=>'Google', 'msnbot'=>'MSN', 'nutch'=>'Looksmart', 'scooter'=>'AltaVista', 'wisenutbot'=>'Looksmart', 'yahoo\-blogs'=>'Yahoo', 'yahoo\-verticalcrawler'=>'Yahoo', 'yahoofeedseeker'=>'Yahoo', 'yahooseeker\-testing'=>'Yahoo', 'yahooseeker'=>'Yahoo', 'yahoo\-mmcrawler'=>'Yahoo', 'yahoo!_mindset'=>'Yahoo', 'zyborg'=>'Looksmart', 'cfetch'=>'Kosmix', '^voyager\/'=>'Kosmix', # Additional bots found by Sussex. 'feedfetcher\-google'=>'Google', 'bingbot'=>'MSN', 'twitterbot'=>'Twitter', 'twitterfeed'=>'Twitter', 'yahoo! slurp'=>'Yahoo', 'yahoo pipes'=>'Yahoo', 'yahoo-newscrawler'=>'Yahoo', 'yahoocachesystem'=>'Yahoo', 'yahooexternalcache'=>'Yahoo', 'yahoo! searchmonkey'=>'Yahoo', 'yahooysmcm'=>'Yahoo' ); 1;