Mirror of https://github.com/UnickSoft/graphonline.git (synced 2026-04-12 10:46:26 +00:00)

Commit: first commit
wiki/scripts/robots.php (executable file, 76 lines added)
@@ -0,0 +1,76 @@
<?php if (!defined('PmWiki')) exit();
/* Copyright 2005-2006 Patrick R. Michaud (pmichaud@pobox.com)
   This file is part of PmWiki; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.  See pmwiki.php for full details.

   This file provides various features that allow PmWiki to control
   what web crawlers (robots) see when they visit the site.  Of course
   it's still possible to control robots at the webserver level
   and via robots.txt, but this script provides a finer level
   of control.

   The $MetaRobots variable controls generation of the
   <meta name='robots' ... /> tag in the head of the HTML document.
   By default $MetaRobots is set so that robots do not index pages in
   the Site, SiteAdmin, and PmWiki groups.

   The $RobotPattern variable is used to determine whether the user
   agent accessing the site is a robot, and $IsRobotAgent is set
   accordingly.  By default this pattern identifies Googlebot,
   Yahoo! Slurp, msnbot, Teoma, ia_archiver, BecomeBot, HTTrack,
   MJ12bot, XML Sitemaps, and Yandex as robots.

   If the agent is deemed a robot, then the $RobotActions array is
   checked to see if robots are allowed to perform the given action;
   if not, the robot is immediately sent an HTTP 403 Forbidden
   response.

   If $EnableRobotCloakActions is set, then a pattern is added to
   $FmtP to hide any "?action=" url parameters in page urls
   generated by PmWiki for actions that robots aren't allowed to
   access.  This can greatly reduce the load on the server by
   not providing the robot with links to pages that it will be
   forbidden to index anyway.
*/
## $MetaRobots provides the value for the <meta name='robots' ...> tag.
SDV($MetaRobots,
  ($action!='browse' || !PageExists($pagename)
    || preg_match('#^PmWiki[./](?!PmWiki$)|^Site(Admin)?[./]#', $pagename))
  ? 'noindex,nofollow' : 'index,follow');
if ($MetaRobots)
  $HTMLHeaderFmt['robots'] =
    "  <meta name='robots' content='\$MetaRobots' />\n";

## $RobotPattern is used to identify robots.
SDV($RobotPattern, 'Googlebot|Slurp|msnbot|Teoma|ia_archiver|BecomeBot|HTTrack|MJ12bot|XML Sitemaps|Yandex');
SDV($IsRobotAgent,
  $RobotPattern && preg_match("!$RobotPattern!", @$_SERVER['HTTP_USER_AGENT']));
if (!$IsRobotAgent) return;
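## Editor's note -- an illustrative sketch, not in the original file:
## to classify additional crawlers as robots (the agent strings below
## are examples, not defaults of this script), local/config.php could
## set the pattern before this script runs, again relying on SDV():
##   $RobotPattern = 'Googlebot|Slurp|msnbot|Teoma|ia_archiver'
##     . '|BecomeBot|HTTrack|MJ12bot|XML Sitemaps|Yandex'
##     . '|bingbot|DuckDuckBot';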

## $RobotActions indicates which actions a robot is allowed to perform.
SDVA($RobotActions, array('browse' => 1, 'rss' => 1, 'dc' => 1));
if (!@$RobotActions[$action]) {
  $pagename = ResolvePageName($pagename);
  if (!PageExists($pagename)) {
    header("HTTP/1.1 404 Not Found");
    print("<h1>Not Found</h1>");
    exit();
  }
  header("HTTP/1.1 403 Forbidden");
  print("<h1>Forbidden</h1>");
  exit();
}
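## Editor's note -- a hedged example, not part of the original file:
## SDVA() fills in defaults only for keys that aren't already set, so a
## wiki administrator could permit an extra action for robots from
## local/config.php, e.g. letting crawlers fetch page source:
##   $RobotActions['source'] = 1;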

## The following removes any ?action= parameters that robots aren't
## allowed to access.
if (IsEnabled($EnableRobotCloakActions, 0)) {
  ## array_filter() with no callback keeps only truthy entries, i.e.
  ## the actions robots may perform (replaces a create_function() call,
  ## which was removed in PHP 8; behavior is identical).
  $p = join('|', array_keys(array_filter($RobotActions)));
  $FmtPV['$PageUrl'] =
    'PUE(($EnablePathInfo)
      ? "\\$ScriptUrl/$group/$name"
      : "\\$ScriptUrl?n=$group.$name")';
  $FmtP["/(\\\$ScriptUrl[^#\"'\\s<>]+)\\?action=(?!$p)\\w+/"] = '$1';
}
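## Editor's note -- usage sketch, not in the original file: the action
## cloaking above is disabled by default (IsEnabled(..., 0)), so it
## must be switched on from local/config.php:
##   $EnableRobotCloakActions = 1;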