#!/opt/i386-linux/perl/bin/perl -w
#
# ssi_html_file_mode_fixup.pl
#
# Copyright (C) 2002 Blair Zajac.  All rights reserved.
# Author: Blair Zajac.
#
# $HeadURL$
# $LastChangedBy$
# $LastChangedDate$
# $LastChangedRevision$
#
# Distributed under the terms of the GNU General Public License,
# version 2, which is available for download at
# http://www.gnu.org/licenses/gpl.html.  I, Blair Zajac, am willing to
# consider licensing this script under different terms.  If you are
# interested, please feel free to contact me.
#
# This script is used to enable the caching of HTML pages on an Apache
# web server when the HTML pages are processed for server side
# includes (SSI).  The script modifies the HTML file's last modified
# time to be at least as new as the newest file it includes and also
# sets or unsets its user and group execute bits as needed if the HTML
# file does or does not use SSI.  This enables effective and accurate
# use of the XBitHack configuration option to make the SSI HTML pages
# cachable.
#
# Server side includes have the form of
#
#   <!--#element attribute=value attribute=value ... -->
#
# and are directives that are placed in HTML pages and evaluated on
# the server while the pages are being served.  They let you add
# dynamically generated content to an existing HTML page, without
# having to serve the entire page via a CGI program, or other dynamic
# technology.
#
# By default, when Apache serves SSI processed HTML pages, the server
# does not send out a `Last-Modified' HTTP header which prevents the
# page from being cached.
#
# One solution to this is to use Apache's XBitHack, which controls the
# parsing of ordinary HTML documents using the Unix user and group
# executable bits.  The XBitHack settings are:
#   off
#     No special treatment of executable files.
#   on
#     Any file that has the user-execute bit set will be treated as a
#     server-parsed html document.
#   full
#     As for on but also test the group-execute bit.
#     If it is set, then set the Last-Modified date of the returned
#     file to be the last modified time of the file.  If it is not
#     set, then no Last-Modified date is sent.  Setting this bit
#     allows clients and proxies to cache the result of the request.
#
# However, using the XBitHack sends out a last modified time of the
# HTML file, not the newest last modified time of the included files.
# So if an included file changes or is newer than the HTML file, then
# the Last-Modified time of the HTML file does not change.
#
# This script solves this problem.  The first argument to this script
# is the DocumentRoot for the HTML files being processed.  This is
# used to find the included files, which are typically listed as
# absolute paths relative to the DocumentRoot in the real
# filesystem.
#
# Any remaining arguments are HTML files.  The script reads each one
# looking for server side includes and then sets the file's modified
# time and its user and group execute bits appropriately:
#   o If a file listed on the command line uses any server side
#     includes, then the user and group execute bits are set.  If the
#     HTML file does not use a single server side include, then the
#     script will unset the user and group executable bits.
#   o If a file listed on the command line uses a server side include
#     and uses an <!--#include ... --> directive to include a file,
#     then the script checks the last modified time of the included
#     file.  If the HTML file's last modified time is older than the
#     last modified time of the newest included file, then the HTML
#     file's last modified time is made equal to the newest last
#     modified time.
#
# You may want to run this script from another script like this:
#
#   #!/bin/sh
#   cd /var/www/public_html
#   /usr/bin/find . -type f -name \*html -print0 | \
#     xargs -0 $HOME/bin/ssi_html_file_mode_fixup.pl . include_*
#
# and run this script from cron.  Here, in /var/www/public_html the
# files that are included by other HTML files are named include_*.
# They should also be processed because some may contain server side
# include commands and others may not and the user and group execute
# bits should be set appropriately to save processing time.  Finally,
# the script processes any files ending in html.

use strict;
use warnings;
use Carp;
use HTML::Parser 3.26;
use List::Util qw(max);

&usage("$0: too few arguments") if @ARGV < 2;

# The first argument is the DocumentRoot; it must be an existing
# directory.
my $document_root = shift;

unless (-e $document_root) {
    die "$0: document root `$document_root' does not exist.\n";
}
unless (-d $document_root) {
    die "$0: document root `$document_root' is not a directory.\n";
}

# These are global variables used to pass information from the
# HTML::Parser callbacks to the main routine.  The
# @callback_include_mtimes array holds the last modified times of any
# files included by the HTML file currently being parsed, and
# $callback_uses_ssi is reset to false before each file is parsed.
my @callback_include_mtimes;
my $callback_uses_ssi;

foreach my $html_file (@ARGV) {
    # Reset the variables that the callback may modify.
    $callback_uses_ssi       = '';
    @callback_include_mtimes = ();

    # Use the three-argument form of open with a lexical handle so
    # that a file name beginning with `>' or ending with `|' is
    # opened literally instead of being interpreted as a mode or a
    # command.
    open(my $html_fh, '<', $html_file)
        or die "$0: cannot open `$html_file' for reading: $!\n";

    # Create the parser object.  Every HTML comment is handed to
    # parse_comment as its raw text.
    my $p = HTML::Parser->new(api_version => 3,
                              comment_h   => [\&parse_comment, "text"])
        or die "$0: cannot create HTML::Parser.\n";

    # Parse the file.
    $p->parse_file($html_fh);

    close($html_fh)
        or die "$0: error in closing `$html_file' for reading: $!\n";

    my @html_stat = stat($html_file)
        or die "$0: cannot stat `$html_file': $!\n";

    # Keep the setuid, setgid and sticky bits along with the
    # permission bits so that the chmod calls below do not silently
    # clear them.
    my $html_old_mode = $html_stat[2] & 07777;

    # If there are no included files and the file does not use server
    # side includes, then remove the execute bits to save processing
    # time.  Note that the mask clears the execute bit for user, group
    # and other, although the message only mentions ug+x.
    if (!$callback_uses_ssi and @callback_include_mtimes == 0) {
        my $html_new_mode = $html_old_mode & 07666;
        if ($html_new_mode != $html_old_mode) {
            print "Removing ug+x permissions on $html_file.\n";
            chmod($html_new_mode, $html_file)
                or die "$0: cannot chmod `$html_file': $!\n";
        }
        next;
    }

    # Make sure this file has its executable bits turned on.  As
    # above, the execute bit for other is set too.
    {
        my $html_new_mode = $html_old_mode | 0111;
        if ($html_new_mode != $html_old_mode) {
            print "Adding ug+x permissions on $html_file.\n";
            chmod($html_new_mode, $html_file)
                or die "$0: cannot chmod `$html_file': $!\n";
        }
    }

    # Find the maximum last modified time from the included files.
    # The leading 0 keeps the result defined when nothing was
    # included.
    my $max_mtime = max(0, @callback_include_mtimes);

    # If the maximum include last modified time is newer than this
    # HTML file's last modified time, then update the time.  The
    # access time is left as it was.
    my $html_atime = $html_stat[8];
    my $html_mtime = $html_stat[9];
    if ($max_mtime > $html_mtime) {
        print "Updating $html_file mtime from `",
              scalar localtime($html_mtime),
              "' to `",
              scalar localtime($max_mtime),
              "'.\n";
        utime($html_atime, $max_mtime, $html_file)
            or die "$0: cannot utime `$html_file': $!\n";
    }
}

exit 0;

# NOTE(review): the remainder of parse_comment lies beyond the visible
# source, so the code below is reproduced unchanged up to the point
# where the source ends.
sub parse_comment {
    unless (@_ == 1) {
        croak "$0: parse_comment passed incorrect number of arguments.\n";
    }

    my $text = shift;

    # Check if any server side includes are being used.
    if ($text =~ m/^