#!/usr/bin/perl
# retrainUser.osbf-lua - Retrain a user's spam filter
#
# Author:
#   Steve Pellegrin (spellegrin at convoglio dot com)
#
# History:
#   1.0  2006-August-6      Original code
#   1.1  2008-September-29  Rework command line option processing
#
# Usage:
#   retrainUser.osbf-lua [--verbose] [--quiet] [--force] [--help]
#
# Description:
#   This script can be run manually, but it probably makes more sense to run it
#   periodically from a cron job.
#
#   Overall flow:
#     1. Make sure the training and corpus directories exist under $HOME.
#     2. Unless --force is given (or the stat files are missing), score the
#        current corpus to get the "before" averages.
#     3. With --force, delete and re-create the stat files.
#     4. Build a shuffled index of the corpus and run the trainer on it.
#     5. Score the corpus again and print a summary if the averages moved by
#        at least $minDelta (or always, with --verbose).

use strict;     # to catch stupid errors
use warnings;

use Cwd;
use File::Path 'make_path';
use Getopt::Long;
use List::Util 'shuffle';
use Pod::Usage;
use Scalar::Util 'looks_like_number';

# ---------- Configuration ------------------------------------------

# Capture the user and home directory.  The user name is only used in
# messages.  USER is not guaranteed to be set when running from cron, so
# fall back to LOGNAME and then to the password database.
my $user = $ENV{'USER'};
$user = $ENV{'LOGNAME'} unless defined($user) && length($user);
$user = getpwuid($<)    unless defined($user) && length($user);
$user = 'unknown'       unless defined($user) && length($user);

my $userHome = $ENV{'HOME'};

# The number of training passes to make
my $trainingPasses = 6;

# This is the sub-directory of /home/user that contains the trainer files.
my $trainingDir = "osbf-lua";

# These are the names of the spam and nonspam directories for archiving
# reclassified files.
my $corpusDir     = "corpus";
my $spamDir       = "spam";
my $nonspamDir    = "good";
my $indexFileName = "index";

# Flags used for training spam and nonspam
my $spamFlag    = "--learn=spam";
my $nonspamFlag = "--learn=nonspam";

# Names of the stat files
my $spamStatFile    = "spam.cfc";
my $nonspamStatFile = "nonspam.cfc";

# Initial stat file size (when doing "force")
my $numberOfBuckets = 377287;
my $initStatFileCmd = "/usr/local/osbf-lua/create_databases.lua $numberOfBuckets";

# Training directories
my $globalDir = "/usr/local/osbf-lua";
my $configDir = "/usr/local/etc/osbf-lua";

# The training and classification commands
my $trainerCmd  = "$globalDir/toer.lua ./";
my $classifyCmd = "$globalDir/spamfilter.lua";

# Report if values change by at least this much.
my $minDelta = 1.0;

# ---------- End of Configuration -----------------------------------

# Extract the command line arguments
my $verbose = '';
my $quiet   = '';
my $force   = '';
my $help    = '';

GetOptions(
    'verbose' => \$verbose,
    'quiet'   => \$quiet,
    'force'   => \$force,
    'help'    => \$help,
) or pod2usage(2);

# Print help text and exit if requested.
pod2usage(1) if $help;

# Every path below hangs off the home directory; without it the script
# would silently operate on "/osbf-lua" and "/corpus/...".
die "HOME is not set; cannot locate the training directories\n"
    unless defined($userHome) && length($userHome);

my $corpusSpamDir    = "$corpusDir/$spamDir";
my $corpusNonspamDir = "$corpusDir/$nonspamDir";

my $userTrainingDir = "$userHome/$trainingDir";
my $userSpamDir     = "$userHome/$corpusSpamDir";
my $userNonspamDir  = "$userHome/$corpusNonspamDir";

# Ensure that the directories exist (equivalent of "mkdir -p", without
# passing the paths through a shell).
make_path($userTrainingDir, $userSpamDir, $userNonspamDir);

# change to the user's home directory
chdir($userHome) or die "Cannot change directory to $userHome: $!\n";

# Remember average scores for later reporting
my $oldSpamAverageScore    = 0;
my $oldNonspamAverageScore = 0;
my $newSpamAverageScore    = 0;
my $newNonspamAverageScore = 0;

# The "before" scores only make sense when existing stat files are going
# to be kept.
if (   (-e "$trainingDir/$spamStatFile")
    && (-e "$trainingDir/$nonspamStatFile")
    && !$force)
{
    ($oldSpamAverageScore, $oldNonspamAverageScore) = classifyMessages();
}

# Reset stat files if requested
if ($force) {
    print "Reset Stat files for $user\n" if ($verbose);
    initStatFiles();
}

print "Training $user\n" if ($verbose);
trainMessages($verbose);
print "\n" if ($verbose);

# Get final scores
($newSpamAverageScore, $newNonspamAverageScore) = classifyMessages();

# Print summary
unless ($quiet) {
    my $averagesChanged =
           (abs($oldSpamAverageScore - $newSpamAverageScore) >= $minDelta)
        || (abs($oldNonspamAverageScore - $newNonspamAverageScore) >= $minDelta);

    if ($verbose || $averagesChanged) {
        print "Summary for $user\n";
        print " Average Scores\n";
        print " Spam: $oldSpamAverageScore -> $newSpamAverageScore\n";
        print " Nonspam: $oldNonspamAverageScore -> $newNonspamAverageScore\n";
        print "\n\n\n";
    }
}

exit 0;

# Quote a string so that /bin/sh treats it as exactly one word.
#   $text - the string to quote
# Returns the string wrapped in single quotes, with embedded single quotes
# rewritten as '\''.
sub shellQuote {
    my ($text) = @_;

    $text =~ s/'/'\\''/g;

    return "'$text'";
}

# reset stat files
#
# Deletes the spam and nonspam stat files in the user's training directory
# and runs $initStatFileCmd there to create fresh ones.  The working
# directory is restored afterwards.
sub initStatFiles {
    # The current directory is the user's home directory
    my $formerDir = cwd;

    # Refuse to continue if the chdir fails: the unlink below uses relative
    # names and must not run in some other directory.
    chdir($userTrainingDir)
        or die "Cannot change directory to $userTrainingDir: $!\n";

    unlink $spamStatFile, $nonspamStatFile;

    # Output of the command is captured and intentionally discarded.
    my $discardedOutput = `$initStatFileCmd`;

    chdir($formerDir) or die "Cannot change directory to $formerDir: $!\n";

    return;
}

# get a list of the files in a directory
#   $directoryName - directory to read (absolute, or relative to the cwd)
# Returns the names of all entries that do not start with a dot, in
# readdir order.  A missing or unreadable directory yields an empty list.
sub getFileList {
    my ($directoryName) = @_;

    my @fileList;

    if (-e $directoryName) {
        if (opendir(my $messageDir, $directoryName)) {
            @fileList = grep { /^[^.]/ } readdir($messageDir);
            closedir($messageDir);
        }
        else {
            warn "Cannot read directory $directoryName: $!\n";
        }
    }

    return @fileList;
}

# classify existing messages
#
# Scores every file in the spam and nonspam corpus directories (relative to
# the current directory, which is the user's home directory).
# Returns (spam average score, nonspam average score); an empty corpus
# averages to 0.
sub classifyMessages {
    # Get lists of spam and nonspam files
    my @spamFileList    = getFileList($corpusSpamDir);
    my @nonspamFileList = getFileList($corpusNonspamDir);

    # Count of total messages seen and their total score
    my $countSpamMessages    = 0;
    my $countNonspamMessages = 0;
    my $totalSpamScore       = 0;
    my $totalNonspamScore    = 0;

    # Process files, alternating between the two corpora.
    while (@spamFileList || @nonspamFileList) {
        my $nextSpamFile    = pop(@spamFileList);
        my $nextNonspamFile = pop(@nonspamFileList);

        # Only count a message when there really is one: the corpora are
        # usually of different sizes, and counting the exhausted side would
        # drag its average towards zero.
        if (defined($nextSpamFile)) {
            $totalSpamScore += classifyFile($spamFlag, $userSpamDir, $nextSpamFile);
            $countSpamMessages += 1;
        }

        if (defined($nextNonspamFile)) {
            $totalNonspamScore += classifyFile($nonspamFlag, $userNonspamDir, $nextNonspamFile);
            $countNonspamMessages += 1;
        }
    }

    my $spamAverageScore =
        ($countSpamMessages == 0) ? 0 : ($totalSpamScore / $countSpamMessages);
    my $nonspamAverageScore =
        ($countNonspamMessages == 0) ? 0 : ($totalNonspamScore / $countNonspamMessages);

    # Return the average scores
    return ($spamAverageScore, $nonspamAverageScore);
}

# train messages in a directory
#   $printInfo - when true, print the trainer's statistics file(s)
#
# Runs inside $trainingDir (relative to the current directory): builds the
# shuffled index file, runs the trainer, optionally prints the statistics,
# removes the working files and restores the working directory.
sub trainMessages {
    # Make pretty names for input arguments
    my ($printInfo) = @_;

    my $formerDir = cwd;
    chdir($trainingDir) or die "Cannot change directory to $trainingDir: $!\n";

    buildTrainingFiles($indexFileName, "../$corpusSpamDir", "../$corpusNonspamDir");

    # Output of the trainer is captured and intentionally discarded.
    my $discardedOutput = `$trainerCmd`;

    if ($printInfo) {
        # Print the stats from the final pass.
        print `cat toer-lua_training-stats*$indexFileName`;
    }

    cleanupFiles($trainingPasses, $indexFileName);

    chdir($formerDir) or die "Cannot change directory to $formerDir: $!\n";

    return;
}

# build the index file consumed by the trainer
#   $fileName - name of the index file to (over)write
#   $spamDir  - directory holding the spam messages
#   $goodDir  - directory holding the nonspam messages
#
# Writes one line per message, "spam <path>" or "ham <path>", in random
# order.
sub buildTrainingFiles {
    my ($fileName, $spamDir, $goodDir) = @_;

    my @fileList = (
        (map { "spam $spamDir/$_" } getFileList($spamDir)),
        (map { "ham $goodDir/$_" } getFileList($goodDir)),
    );

    open(my $shuffleHandle, '>', $fileName)
        or die "Cannot write $fileName: $!\n";

    for my $testFileName (shuffle(@fileList)) {
        print {$shuffleHandle} "$testFileName\n";
    }

    # Buffered write errors only show up at close time.
    close($shuffleHandle) or die "Cannot finish writing $fileName: $!\n";

    return;
}

# remove the working files left in the current directory
#   $iterations - number of training passes
#   $baseName   - base name of the index (shuffle) files
#
# Both arguments are used as glob patterns, so they must not contain
# whitespace or glob metacharacters.
sub cleanupFiles {
    my ($iterations, $baseName) = @_;

    # Remove all shuffle files
    unlink glob("$baseName*");

    # Remove all but the last log files.
    for my $pass (1 .. $iterations - 1) {
        unlink glob("toer-lua*$pass");
    }

    return;
}

# score a single message
#   $flag - learn flag for the message's class; accepted for interface
#           compatibility but not used when scoring
#   $dir  - directory containing the message
#   $file - message file name, or undef
# Returns the score printed by the classifier, or 0 when $file is undefined
# or the classifier printed nothing numeric.
sub classifyFile {
    my ($flag, $dir, $file) = @_;

    return 0 unless defined($file);

    my $messageFile = "$dir/$file";

    # The message is fed to the classifier on stdin, which needs a shell
    # redirect, so every word is quoted to survive names containing spaces,
    # quotes or shell metacharacters.
    my $statsCommand = join(' ',
        shellQuote($classifyCmd),
        shellQuote("--udir=$userTrainingDir"),
        shellQuote("--gdir=$globalDir"),
        shellQuote("--cfgdir=$configDir"),
        '--score',
        '<', shellQuote($messageFile),
    );

    # Get current classification
    my $score = `$statsCommand`;
    $score = '' unless defined($score);
    chomp($score);

    return $score if looks_like_number($score);

    # Accept a leading number followed by other text, as plain numeric
    # conversion would.
    if ($score =~ /\A\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)/) {
        return $1;
    }

    warn "No numeric score returned for $messageFile; counting it as 0\n";

    return 0;
}

__END__

=head1 NAME

retrainUser.osbf-lua - Retrain a user's spam filter

=head1 SYNOPSIS

retrainUser.osbf-lua [options]

=head1 DESCRIPTION

Retrains the spam filter of the current user from the messages stored in
the F<corpus/spam> and F<corpus/good> directories under the user's home
directory, and reports the average spam and nonspam scores before and
after training.

=head1 OPTIONS

=over 8

=item B<--verbose>

Print verbose output

=item B<--quiet>

Print minimal output

=item B<--force>

Reinitialize cfc files

=item B<--help>

Print this text

=back

=cut