#!/opt/i386-linux/perl/bin/perl -w
#
# ssi_html_file_mode_fixup.pl
#
# Copyright (C) 2002 Blair Zajac. All rights reserved.
# Author: Blair Zajac .
#
# $HeadURL$
# $LastChangedBy$
# $LastChangedDate$
# $LastChangedRevision$
#
# Distributed under the terms of the GNU General Public License,
# version 2, which is available for download at
# http://www.gnu.org/licenses/gpl.html. I, Blair Zajac, am willing to
# consider licensing this script under different terms. If you are
# interested, please feel free to contact me.
#
# This script is used to enable the caching of HTML pages on an Apache
# web server when the HTML pages are processed for server side
# includes (SSI). The script modifies the HTML file's last modified
# time to be at least as new as the newest file it includes and also
# sets or unsets its user and group execute bits as needed if the HTML
# file does or does not use SSI. This enables effective and accurate
# use of the XBitHack configuration option to make the SSI HTML pages
# cachable.
#
# Server side includes have the form of
#
#   <!--#element attribute="value" ... -->
#
# and are directives that are placed in HTML pages and evaluated on
# the server while the pages are being served. They let you add
# dynamically generated content to an existing HTML page, without
# having to serve the entire page via a CGI program, or other dynamic
# technology.
#
# By default, when Apache serves SSI processed HTML pages, the server
# does not send out a `Last-Modified' HTTP header which prevents the
# page from being cached.
#
# One solution to this is to use Apache's XBitHack, which controls the
# parsing of ordinary HTML documents using the Unix user and group
# executable bits. The XBitHack settings are:
# off
# No special treatment of executable files.
# on
# Any file that has the user-execute bit set will be treated as a
# server-parsed html document.
# full
# As for on but also test the group-execute bit. If it is set,
# then set the Last-Modified date of the returned file to be the
# last modified time of the file. If it is not set, then no
# Last-Modified date is sent. Setting this bit allows clients and
# proxies to cache the result of the request.
#
# However, using the XBitHack sends out a last modified time of the
# HTML file, not the newest last modified time of the included files.
# So if an included file changes or is newer than the HTML file, then
# the Last-Modified time of the HTML file does not change.
#
# This script solves this problem. The first argument to this script
# is the DocumentRoot for the HTML files being processed. This is
# used to find the included files, which are typically listed as
# absolute paths relative to the DocumentRoot in the real
# filesystem.
#
# Any remaining arguments are HTML files that this script scans for
# server side includes. For each file, it then sets the modified
# time and the user and group execute bits as appropriate:
# o If a file listed on the command line uses any server side
# includes, then the user and group execute bits are set. If the
# HTML file does not use a single server side include, then the
# script will unset the user and group executable bits.
# o If a file listed on the command line uses a server side include
# and uses <!--#include ... --> to include a file, then
# the script checks the last modified time of the included file.
# If the HTML file's last modified time is older than the last
# modified time of the newest included file, then the HTML file's
# last modified time is made equal to the newest last modified
# time.
#
# You may want to run this script from another script like this:
#
# #!/bin/sh
# cd /var/www/public_html
# /usr/bin/find . -type f -name \*html -print0 | \
# xargs -0 $HOME/bin/ssi_html_file_mode_fixup.pl . include_*
#
# and run this script from cron. Here, in /var/www/public_html the
# files that are included by other HTML files are named include_*.
# They should also be processed because some may contain server side
# include commands and others may not and the user and group execute
# bits should be set appropriately to save processing time. Finally,
# the script processes any files ending in html.
use strict;
use Carp;
use HTML::Parser 3.26;
# The script needs the document root plus at least one HTML file;
# otherwise hand off to usage() with an explanatory message.
&usage("$0: too few arguments") if @ARGV < 2;

# The first argument is the document root; it must be an existing
# directory.
my $document_root = shift;
unless (-e $document_root) {
    die "$0: document root `$document_root' does not exist.\n";
}
unless (-d $document_root) {
    die "$0: document root `$document_root' is not a directory.\n";
}

# These are global variables used to pass information from the
# HTML::Parser callbacks to the main routine.  The
# @callback_include_mtimes array holds the last modified times of any
# files included by the HTML currently being parsed, and
# $callback_uses_ssi is the flag tested below to decide whether the
# file uses any server side include at all.
my @callback_include_mtimes;
my $callback_uses_ssi;

foreach my $html_file (@ARGV) {
    # Reset the variables that the callback may modify.
    $callback_uses_ssi       = '';
    @callback_include_mtimes = ();

    # Use the three-argument form of open with a lexical handle so that
    # the file name is always taken literally.  With the two-argument
    # form a name such as `foo|' or `>bar' would be interpreted as a
    # pipe or an output redirection, and the name opened could differ
    # from the one later passed to stat, chmod and utime.
    open(my $html_fh, '<', $html_file)
        or die "$0: cannot open `$html_file' for reading: $!\n";

    # Create the parser object.  Every HTML comment is handed to
    # parse_comment as its raw source text.
    my $p = HTML::Parser->new(api_version => 3,
                              comment_h   => [ \&parse_comment,
                                               "text"])
        or die "$0: cannot create HTML::Parser.\n";

    # Parse the file.
    $p->parse_file($html_fh);

    close($html_fh)
        or die "$0: error in closing `$html_file' for reading: $!\n";

    my @html_stat = stat($html_file)
        or die "$0: cannot stat `$html_file': $!\n";

    # Keep all twelve permission bits.  Masking with 0777 here would
    # make every chmod below silently drop the setuid, setgid and
    # sticky bits, which the 07666 mask is written to preserve.
    my $html_old_mode = $html_stat[2] & 07777;

    # If there are no included files and the file does not use server
    # side includes, then remove the execute bits to save processing
    # time.
    # NOTE(review): the 07666 mask also clears the other-execute bit,
    # although the message only mentions ug+x.
    if (!$callback_uses_ssi and @callback_include_mtimes == 0) {
        my $html_new_mode = $html_old_mode & 07666;
        if ($html_new_mode != $html_old_mode) {
            print "Removing ug+x permissions on $html_file.\n";
            chmod($html_new_mode, $html_file)
                or die "$0: cannot chmod `$html_file': $!\n";
        }
        next;
    }

    # Make sure this file has its executable bits turned on.
    # NOTE(review): 0111 also sets the other-execute bit, although the
    # message only mentions ug+x.
    {
        my $html_new_mode = $html_old_mode | 0111;
        if ($html_new_mode != $html_old_mode) {
            print "Adding ug+x permissions on $html_file.\n";
            chmod($html_new_mode, $html_file)
                or die "$0: cannot chmod `$html_file': $!\n";
        }
    }

    # Find the maximum last modified time from the included files.
    my $max_mtime = 0;
    foreach my $include_mtime (@callback_include_mtimes) {
        $max_mtime = $include_mtime if $include_mtime > $max_mtime;
    }

    # If the maximum include last modified time is newer than this HTML
    # file's last modified time, then update the time.  The access time
    # is passed through unchanged.
    my $html_atime = $html_stat[8];
    my $html_mtime = $html_stat[9];
    if ($max_mtime > $html_mtime) {
        print "Updating $html_file mtime from `", scalar localtime($html_mtime),
              "' to `", scalar localtime($max_mtime), "'.\n";
        utime $html_atime, $max_mtime, $html_file
            or die "$0: cannot utime `$html_file': $!\n";
    }
}

exit 0;
sub parse_comment {
unless (@_ == 1) {
croak "$0: parse_comment passed incorrect number of arguments.\n";
}
my $text = shift;
# Check if any server side includes are being used.
if ($text =~ m/^