#!/usr/bin/perl # trainUser.osbf-lua - Examine a single user's mail folders looking for messages to be # trained as either spam or good. # # Author: # Steve Pellegrin (spellegrin at convoglio dot com) # # History: # 1.0 2006-August-6 Original code # 2.0 2007-August-25 Use addHeader PERL script to redeliver # when training cannot be performed # 2.1 2008-September-1 Log pre- and post-training scores # New algorithm to generate unique corpus file names # 2.2 2008-September-29 Rework command line option processing # # Usage: # trainUser.osbf-lua --spam | [--nonspam | --good] # # Description: # This script can be run manually, but it probably makes more sense to run it # periodically as a cron job. Each time it runs, the script checks for # messages to be trained as either spam or non-spam. Each such message is trained, # then resubmitted for delivery. The copy of the message in the special "training" # mail folder is deleted. # # From the user's perspective, once a message has been moved to the appropriate # training folder, it will eventually be redelivered to the correct folder (Inbox or Spam). # # This script will work for either maildir or mbox. use strict; # to catch stupid errors use Sys::Syslog qw(:DEFAULT setlogsock); use Getopt::Long; use Pod::Usage; # If your mail is in mbox format, uncomment the "use" statement below. # You will have to install: # Algorithm::Diff # Text::Diff # FileHandle::Unget # Mail::Mbox::MessageParser #use Mail::Mbox::MessageParser; # ---------- Configuration ------------------------------------------ # Capture the user and home directory my $user = $ENV{'USER'}; my $userHome = $ENV{'HOME'}; # Location of local mail delivery agent (usually maildrop or procmail) my $deliveryAgent = "/usr/local/bin/procmail"; # This is the directory that contains the mail trainer files my $trainerDir = "$userHome/osbf-lua"; # This is the directory that contains the mail trainer cache my $trainerCacheDir = "$trainerDir/cache"; # Where to put copies of trained files my $corpusDir = "$userHome/corpus"; my $corpusSpamDir = "$corpusDir/spam"; my $corpusGoodDir = "$corpusDir/good"; # Do not store files larger than this in the corpus my $maxCorpusFileSize = 1024 * 1024; # Mail folder for messages to be trained as Spam # May be either a maildir directory or a mbox file. my $mailDir = "$userHome/Maildir"; my $spamFolder = "$mailDir/.Train as Spam"; # Mail folder for messages to be trained as Nonspam # May be either a maildir directory or a mbox file. my $nonspamFolder = "$mailDir/.Train as Nonspam"; # The training and classification commands when executed from a user's training directory my $baseCmd = "/usr/local/osbf-lua/spamfilter.lua --udir=. --gdir=/usr/local/osbf-lua --cfgdir=/usr/local/etc/osbf-lua --source=sfid"; my $trainingCmd = "$baseCmd --output=message"; my $classificationCmd = "$baseCmd --score"; my $addTrainingHeaderCmd = "/usr/local/bin/addHeader"; # Training command options my $spamOption = "--learn=spam"; my $nonspamOption = "--learn=nonspam"; # Message headers we need to find in each message my $classificationHeader = "X-OSBF-Lua-Score"; # A token that, when found in the classification header, indicates that # the message is currently classified as Spam. The comparison for this # token is case-sensitive. my $spamClassification = "\[[S|-]\]"; # Classification headers my $spamTrainingHeader = "[S] TS"; my $goodTrainingHeader = "[H] TH"; # # Syslogging options are for verbose mode and errors. # # NOTE: Comment out the $syslog_socktype line if syslogging # does not work on your system. my ($syslogSocktype) = 'unix'; # inet, unix, stream, console my ($syslogFacility) = "mail"; my ($syslogOptions) = "pid"; my ($syslogPriority) = "info"; my ($syslogIdent) = "spamfilter"; # ---------- End of Configuration ----------------------------------- # Extract the command line arguments # GetOptions ( 'spam' => \(my $spam = ''), 'nonspam|good' => \(my $good = ''), 'help' => \(my $help = ''), ) or pod2usage(2); # Print help text and exit if requested. pod2usage(1) if $help; # Can't train both ways at once. pod2usage(1) if ($spam && $good); # But we have to train one way! pod2usage(1) unless ($spam || $good); # Prepare for syslog. setlogsock ($syslogSocktype); openlog ($syslogIdent, $syslogOptions, $syslogFacility); # Define some useful directories. The training directory we # choose depends on our command line argument. my $messageFolder = $spam ? $spamFolder : $nonspamFolder; # change to the user's mail trainer directory `mkdir -p $trainerDir`; chdir("$trainerDir"); # See whether we are dealing with mbox or maildir. my $isMaildir = isMaildirFolder($messageFolder); # Collect messages to train -- one per file. my @messageFiles = $isMaildir ? collectMaildirFiles($messageFolder) : collectMboxFiles($userHome, $messageFolder); # Process each file foreach my $messageFile (@messageFiles) { # Read the file's headers in order to accumulate certain necessary information. open MESSAGE, "<$messageFile" or die "Cannot open $messageFile"; # The "necessary information" my $messageFrom = undef; my $messageSubject = undef; my $messageCacheID = undef; while () { # Terminate the loop if a blank line (indicating the end of the headers) is found. last if (/^$/); # Try to capture info about the message. $messageFrom = $1 if (!defined($messageFrom) && (/^From:\s+(.*)$/)); $messageSubject = $2 if (!defined($messageSubject) && (/^Subject:\s+(\*\*\*TRAIN ME\*\*\*\s*)?(.*)\s*$/)); # Take the last cache id $messageCacheID = $1 if (/<(sfid[^\s]+\@convoglio\.com)>$/); } # Close the message file. close MESSAGE; # Make sure that the corpus directory exists my $finalCorpusDir = ($spam) ? $corpusSpamDir : $corpusGoodDir; `mkdir -p $finalCorpusDir`; # Remove spaces from the training file name. my $processedMessageFile = $messageFile; $processedMessageFile =~ s/ /\\ /g; # If the cached file still exists, link it into the corpus, # otherwise use the current copy. my $cacheFile = "$trainerCacheDir/$messageCacheID"; my $cacheFileExists = (-f $cacheFile); # If the cache file exists, we will train. # Otherwise, we fake training so that we can get the # message redelivered to the user. my $action; my $originalScore; my $finalScore; if ($cacheFileExists) { # Do not put the file in the corpus if it is too large. if (fileSize($cacheFile) <= $maxCorpusFileSize) { my $corpusFile = createCorpusFile($finalCorpusDir); `ln -f $cacheFile $corpusFile`; } # Prepare to train the file my $finalTrainingCmd; if ($spam) { # We are going to train this message as Spam. $finalTrainingCmd = $trainingCmd . " " . $spamOption; } else { # We are going to train this message as Nonspam. $finalTrainingCmd = $trainingCmd . " " . $nonspamOption; } # Find the pre-training score. $originalScore = `$classificationCmd <$processedMessageFile`; # Perform training, then redeliver the message. `$finalTrainingCmd <$processedMessageFile | $deliveryAgent`; # Find the post-training score. $finalScore = `$classificationCmd <$processedMessageFile`; # The action performed depends on the original and final scores. if ($originalScore < 0) { $action = ($finalScore < 0) ? "Reenforce" : "Reclassify"; } else { $action = ($finalScore < 0) ? "Reclassify" : "Reenforce"; } } else { # Find the pre-training score and fake the post-training score. $originalScore = `$classificationCmd <$processedMessageFile`; $finalScore = $originalScore; # Pass the incoming message to our filter that will add a fake training header, # then return the message to the user as if we trained it. $action = "Redeliver"; my $finalClass = $spam ? $spamTrainingHeader : $goodTrainingHeader; `$addTrainingHeaderCmd \"$classificationHeader: $finalScore $finalClass\" <$processedMessageFile | $deliveryAgent`; } # We no longer have any use for the messageFile. `rm -f $processedMessageFile`; # Write the log message. $originalScore = trim($originalScore); $finalScore = trim($finalScore); my $messageStatus = $spam ? "Spam" : "Good"; my $logEntry = "User: $user Status: $action $messageStatus ($originalScore --> $finalScore) From: $messageFrom Subject: $messageSubject"; syslog($syslogPriority=>$logEntry); } exit 0; sub createCorpusFile { my ($dir) = @_; # Construct candidate file names until we find one that # does not already exist. my $file; for (;;) { $file = $dir . "/" . time; last if (!( -e $file )); } $file; } sub isMaildirFolder { # Return true if maildir, false if mbox. my ($fileName) = @_; my $isMaildir = 0; if (-d $fileName) { $isMaildir = 1; } $isMaildir; } sub collectMaildirFiles { # We are called with the name of the outermost maildir # folder. It will have three subfolders, two of which # should be examined for messages. my ($maildirFolder) = @_; my @curFiles = collectFromDirectory("$maildirFolder/cur"); my @newFiles = collectFromDirectory("$maildirFolder/new"); # Return the contents of both directories. (@curFiles, @newFiles); } sub collectFromDirectory { my ($directory) = @_; my @fileList; # It is possible that the directory does not exist. # In this case, return an empty list. if (-e $directory) { # get a list of message files in the directory opendir(MESSAGEDIR, $directory); my @files = readdir(MESSAGEDIR); closedir(MESSAGEDIR); foreach my $file (@files) { # Skip files that begin with a dot. next if ($file =~ /^\./); my $fullFileName = "$directory/$file"; push (@fileList, $fullFileName); } } @fileList; } sub collectMboxFiles { my ($baseDir, $mboxFile) = @_; my @fileList; # It is possible that the mbox file does not exist or is empty. # In this case, return an empty list. if (-s $mboxFile) { # Create the directory that will hold the messages if it does not already exist. my $messageDir = "$baseDir/.train"; if (! -e $messageDir) { mkdir $messageDir, 0700; } # Open the mbox file. my $handle = new FileHandle($mboxFile); my $reader = new Mail::Mbox::MessageParser( { 'file_name' => $mboxFile, 'file_handle' => $handle, 'enable_cache' => 0, 'enable_grep' => 1, } ); die $reader unless ref $reader; # Eat any data before the start of the first email message. $reader->prologue; # Loop through each message in the mbox file. while (!$reader->end_of_file()) { # $message will contain a reference to the message text, # NOT the message text itself. my $message = $reader->read_next_email(); my $messageNumber = $reader->number(); # Create a temp file to hold the message. my $fileName = "$messageDir/message.$$.$messageNumber"; open FILE, ">$fileName" or die; print FILE $$message; close FILE; # Add the temp file to our list of email files. # This file will be deleted once it is trained. push (@fileList, $fileName); } # Clear the contents of the mbox file. $handle->close; open MBOX, ">$mboxFile"; close MBOX; } @fileList; } sub fileSize { # Fetch arguments my ($fileName) = @_; my $size = 0; if (-e $fileName) { $size = (stat $fileName)[7]; } $size; } sub trim() { my ($string) = @_; chomp $string; $string =~ s/^\s+//; $string =~ s/\s+$//; return $string; } __END__ =head1 NAME trainUser.osbf-lua - Train a user account =head1 SYNOPSIS trainUser.osbf-lua --spam | [--nonspam | --good] | --help =head1 OPTIONS =over 8 =item B<--spam> Train as spam =item B<--good|nonspam> Train as non-spam =item B<--help> Print this text =back =cut