Mirror of https://github.com/UnickSoft/graphonline.git (synced 2026-04-12 10:46:26 +00:00)

Commit: first commit
wiki/scripts/robots.php (executable file, 76 lines added)
@@ -0,0 +1,76 @@
<?php if (!defined('PmWiki')) exit();
/* Copyright 2005-2006 Patrick R. Michaud (pmichaud@pobox.com)
   This file is part of PmWiki; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.  See pmwiki.php for full details.

   This file provides various features that allow PmWiki to control
   what web crawlers (robots) see when they visit the site.  Of course
   it's still possible to control robots at the webserver level
   and via robots.txt, but this script provides a finer level
   of control.

   The $MetaRobots variable controls generation of the
   <meta name='robots' ... /> tag in the head of the HTML document.
   By default $MetaRobots is set so that robots do not index pages in
   the Site, SiteAdmin, and PmWiki groups.

   The $RobotPattern variable is used to determine whether the user
   agent accessing the site is a robot, and $IsRobotAgent is set
   accordingly.  By default this pattern identifies Googlebot,
   Yahoo! Slurp, msnbot, Teoma, ia_archiver, BecomeBot, HTTrack,
   MJ12bot, XML Sitemaps, and Yandex as robots.

   If the agent is deemed a robot, then the $RobotActions array is
   checked to see if robots are allowed to perform the given action;
   if not, the robot is immediately sent an HTTP 403 Forbidden
   response.

   If $EnableRobotCloakActions is set, then a pattern is added to
   $FmtP to hide any "?action=" url parameters in page urls
   generated by PmWiki for actions that robots aren't allowed to
   access.  This can greatly reduce the load on the server by
   not providing the robot with links to pages that it will be
   forbidden to index anyway.
*/
## $MetaRobots provides the value for the <meta name='robots' ...> tag.
SDV($MetaRobots,
  ($action!='browse' || !PageExists($pagename)
    || preg_match('#^PmWiki[./](?!PmWiki$)|^Site(Admin)?[./]#', $pagename))
  ? 'noindex,nofollow' : 'index,follow');
if ($MetaRobots)
  $HTMLHeaderFmt['robots'] =
    "  <meta name='robots' content='\$MetaRobots' />\n";

## $RobotPattern is used to identify robots.
SDV($RobotPattern, 'Googlebot|Slurp|msnbot|Teoma|ia_archiver|BecomeBot|HTTrack|MJ12bot|XML Sitemaps|Yandex');
SDV($IsRobotAgent,
  $RobotPattern && preg_match("!$RobotPattern!", @$_SERVER['HTTP_USER_AGENT']));
if (!$IsRobotAgent) return;
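## Editor's note -- an illustrative sketch, not in the original file:
## to classify additional crawlers as robots (the agent strings below
## are examples, not defaults of this script), local/config.php could
## set the pattern before this script runs, again relying on SDV():
##   $RobotPattern = 'Googlebot|Slurp|msnbot|Teoma|ia_archiver'
##     . '|BecomeBot|HTTrack|MJ12bot|XML Sitemaps|Yandex'
##     . '|bingbot|DuckDuckBot';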

## $RobotActions indicates which actions a robot is allowed to perform.
SDVA($RobotActions, array('browse' => 1, 'rss' => 1, 'dc' => 1));
if (!@$RobotActions[$action]) {
  $pagename = ResolvePageName($pagename);
  if (!PageExists($pagename)) {
    header("HTTP/1.1 404 Not Found");
    print("<h1>Not Found</h1>");
    exit();
  }
  header("HTTP/1.1 403 Forbidden");
  print("<h1>Forbidden</h1>");
  exit();
}
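## Editor's note -- a hedged example, not part of the original file:
## SDVA() fills in defaults only for keys that aren't already set, so a
## wiki administrator could permit an extra action for robots from
## local/config.php, e.g. letting crawlers fetch page source:
##   $RobotActions['source'] = 1;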

## The following removes any ?action= parameters that robots aren't
## allowed to access.
if (IsEnabled($EnableRobotCloakActions, 0)) {
  ## array_filter() with no callback keeps only truthy entries, i.e.
  ## the actions robots may perform (replaces a create_function() call,
  ## which was removed in PHP 8; behavior is identical).
  $p = join('|', array_keys(array_filter($RobotActions)));
  $FmtPV['$PageUrl'] =
    'PUE(($EnablePathInfo)
      ? "\\$ScriptUrl/$group/$name"
      : "\\$ScriptUrl?n=$group.$name")';
  $FmtP["/(\\\$ScriptUrl[^#\"'\\s<>]+)\\?action=(?!$p)\\w+/"] = '$1';
}
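## Editor's note -- usage sketch, not in the original file: the action
## cloaking above is disabled by default (IsEnabled(..., 0)), so it
## must be switched on from local/config.php:
##   $EnableRobotCloakActions = 1;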