# HG changeset patch # User bears # Date 965855957 -3600 # Node ID 8897b7e3b1081909b82a29ae58d28656fd143e0e # Parent 3c71e28c8eef94d7934098c2c78d18dd728ff5f0 [svn] Add article filtering diff -r 3c71e28c8eef -r 8897b7e3b108 docs/noffle.conf.5 --- a/docs/noffle.conf.5 Tue Jul 25 13:14:54 2000 +0100 +++ b/docs/noffle.conf.5 Wed Aug 09 22:19:17 2000 +0100 @@ -1,5 +1,5 @@ .TH noffle.conf 5 -.\" $Id: noffle.conf.5 155 2000-06-24 20:28:01Z bears $ +.\" $Id: noffle.conf.5 189 2000-08-09 21:19:17Z bears $ .SH NAME noffle.conf \- Configuration file for NOFFLE news server @@ -203,6 +203,17 @@ .br Default: no +.TP +.B filter +Add the specified filter to the list of filters to be applied to incoming +articles. Filters are applied in the order in which they appear in +.I /etc/noffle.conf +and are further described in the section +.B FILTERS +below. +.br +Default: No filters + .SH "GROUP NAME WILDCARDS" .B NOFFLE @@ -254,10 +265,110 @@ .I [^]\-] matches any character other than a close bracket or minus sign. +.SH FILTERS + +.B NOFFLE +supports basic filtering on incoming articles. Articles to be downloaded +can be matched against one or more criteria and matching articles are +marked for download using one of the group subscribe modes +.BR full , +.B over +or +.BR thread . +Alternatively the filter may specify that the article mode is +.B discard +in which case neither the article nor the article overview will +be downloaded. + +.PP +A +.B filter +configuration line consists of one or more filter specifications +following the +.B filter +keyword on the line. The available specifications are: + +.PP +.B action += +.IR "full|over|thread|discard" . +Specifies the action to be taken if the filter matches. If not specified, +the action defaults to full. +.PP +.B group += +.IR "" . +Matches if any group in which the article appears matches the +specified group pattern. +.PP +.B subject += +.IR "" . 
+Matches if the article subject matches the given regular expression. +See the section on regular expressions below. +.PP +.B from += +.IR "" . +Matches if the article sender (the From header) matches the given regular expression. +See the section on regular expressions below. +.PP +.B msgid += +.IR "" . +Matches if the article message ID matches the given regular expression. +See the section on regular expressions below. +.PP +.B bytes +< or = or > +.IR . +Matches if the number of bytes in the article is less than, equal to, or +greater than the given number. +.PP +.B lines +< or = or > +.IR . +Matches if the number of lines in the article is less than, equal to, or +greater than the given number. +.PP +.B refs +< or = or > +.IR . +Matches if the number of articles referenced by the article is less +than, equal to, or greater than the given number. +.PP +.B xposts +< or = or > +.IR . +Matches if the number of groups the article is posted to is less +than, equal to, or greater than the given number. + +.PP +For example, the following filters download all articles in groups +in the alt.binaries tree in full if they are < 10k in size, otherwise +download overviews. +.PP +.I filter group=alt.binaries.* bytes < 10240 action=full +.br +.I filter group=alt.binaries.* action=over +.PP +This filter discards all articles with a subject resembling +the infamous "$$$ Make Money Now! $$$". +.PP +\fIfilter subject="\\$*.*Make.*[M|m]oney.*\\$" action=discard\fR + +.SH REGULAR EXPRESSIONS + +.B NOFFLE +uses extended POSIX-style regular expressions in its filters. Regular +expressions are a powerful means of describing patterns that match +text. A full description is to be found in +.BR regex (7). 
.SH SEE ALSO .BR noffle (1) +.BR regex (7) .SH AUTHORS diff -r 3c71e28c8eef -r 8897b7e3b108 src/Makefile.in --- a/src/Makefile.in Tue Jul 25 13:14:54 2000 +0100 +++ b/src/Makefile.in Wed Aug 09 22:19:17 2000 +0100 @@ -72,7 +72,7 @@ bin_PROGRAMS = noffle -noffle_SOURCES = client.c client.h common.h configfile.c configfile.h content.c content.h control.c control.h database.c database.h dynamicstring.c dynamicstring.h fetch.c fetch.h fetchlist.c fetchlist.h group.c group.h itemlist.c itemlist.h lock.c lock.h log.c log.h noffle.c online.c online.h outgoing.c outgoing.h over.c over.h portable.h post.c post.h protocol.c protocol.h pseudo.c pseudo.h request.c request.h server.c server.h util.c util.h wildmat.c wildmat.h +noffle_SOURCES = client.c client.h common.h configfile.c configfile.h content.c content.h control.c control.h database.c database.h dynamicstring.c dynamicstring.h fetch.c fetch.h fetchlist.c fetchlist.h filter.c filter.h group.c group.h itemlist.c itemlist.h lock.c lock.h log.c log.h noffle.c online.c online.h outgoing.c outgoing.h over.c over.h portable.h post.c post.h protocol.c protocol.h pseudo.c pseudo.h request.c request.h server.c server.h util.c util.h wildmat.c wildmat.h noffle_LDADD = -lgdbm @@ -87,7 +87,7 @@ LDFLAGS = @LDFLAGS@ LIBS = @LIBS@ noffle_OBJECTS = client.o configfile.o content.o control.o database.o \ -dynamicstring.o fetch.o fetchlist.o group.o itemlist.o lock.o log.o \ +dynamicstring.o fetch.o fetchlist.o filter.o group.o itemlist.o lock.o log.o \ noffle.o online.o outgoing.o over.o post.o protocol.o pseudo.o \ request.o server.o util.o wildmat.o noffle_DEPENDENCIES = @@ -105,7 +105,7 @@ GZIP_ENV = --best DEP_FILES = .deps/client.P .deps/configfile.P .deps/content.P \ .deps/control.P .deps/database.P .deps/dynamicstring.P .deps/fetch.P \ -.deps/fetchlist.P .deps/group.P .deps/itemlist.P .deps/lock.P \ +.deps/fetchlist.P .deps/filter.P .deps/group.P .deps/itemlist.P .deps/lock.P \ .deps/log.P .deps/noffle.P .deps/online.P 
.deps/outgoing.P .deps/over.P \ .deps/post.P .deps/protocol.P .deps/pseudo.P .deps/request.P \ .deps/server.P .deps/util.P .deps/wildmat.P diff -r 3c71e28c8eef -r 8897b7e3b108 src/configfile.c --- a/src/configfile.c Tue Jul 25 13:14:54 2000 +0100 +++ b/src/configfile.c Wed Aug 09 22:19:17 2000 +0100 @@ -6,7 +6,7 @@ SPOOLDIR VERSION - $Id: configfile.c 155 2000-06-24 20:28:01Z bears $ + $Id: configfile.c 189 2000-08-09 21:19:17Z bears $ */ #if HAVE_CONFIG_H @@ -17,6 +17,8 @@ #include #include +#include +#include "filter.h" #include "itemlist.h" #include "log.h" #include "util.h"
@@ -554,6 +556,207 @@
     config.autoSubscribeMode[ config.numAutoSubscribeMode++ ] = entry;
 }
 
+/*
+ * Copy the next token of line into value and return a pointer to the
+ * character following it, or NULL if only white space is left.
+ * A token is a run of non-space characters or a string quoted with
+ * ' or "; inside quotes a backslash makes the next character literal.
+ * At most MAXCHAR characters are stored, followed by the terminator.
+ */
+static const char *
+getToken( const char *line, Str value )
+{
+    Bool isQuoted;
+    char quoteChar;
+    Bool seenEscape;
+    char *maxVal;
+
+    while ( *line != '\0' && isspace( (unsigned char) *line ) )
+        line++;
+    if ( *line == '\0' )
+        return NULL;
+
+    maxVal = &value[ MAXCHAR ];
+    isQuoted = ( *line == '\'' || *line == '"' );
+    if ( isQuoted )
+    {
+        quoteChar = *line;
+        line++;
+
+        seenEscape = FALSE;
+        while ( *line != '\0'
+                && ( *line != quoteChar || seenEscape )
+                && value < maxVal )
+        {
+            if ( seenEscape )
+            {
+                *value++ = *line;
+                seenEscape = FALSE;
+            }
+            else
+            {
+                if ( *line == '\\' )
+                    seenEscape = TRUE;
+                else
+                    *value++ = *line;
+            }
+            line++;
+        }
+
+        if ( *line == quoteChar )
+            line++;
+    }
+    else
+    {
+        while ( *line != '\0' && ! isspace( (unsigned char) *line ) && value < maxVal )
+            *value++ = *line++;
+    }
+    *value = '\0';
+    return line;
+}
+
+/*
+ * Parse a "filter" configuration line and register the filter it
+ * describes. line must be the original (not lower-cased) text, since
+ * patterns are case sensitive. On a syntax error the line is reported
+ * and the partly built filter is released instead of being registered.
+ */
+static void
+getFilter( const char *line )
+{
+    Str ruleBuf, value;
+    const char *l;
+    char *p, *ruleName;
+    Filter *f;
+    FilterRule rule;
+    Bool seenAction, isAction;
+
+    f = new_Filter();
+
+    /* Skip "filter" */
+    l = Utl_restOfLn( line, 1 );
+    seenAction = FALSE;
+
+    for(;;)
+    {
+        while ( *l != '\0' && isspace( (unsigned char) *l ) )
+            l++;
+
+        if ( *l == '\0' )
+            break;
+
+        /* Get the rule title */
+        p = ruleBuf;
+        while ( *l != '\0' && *l != '=' && *l != '<' && *l != '>' )
+            *p++ = *l++;
+        *p = '\0';
+        ruleName = Utl_stripWhiteSpace( ruleBuf );
+        Utl_toLower( ruleName );
+
+        if ( *ruleName == '\0' )
+            goto synErr;
+
+        /* Do we know this rule? */
+        isAction = FALSE;
+        if ( strcmp( ruleName, "group" ) == 0 )
+            rule.type = RULE_NEWSGROUP;
+        else if ( strcmp( ruleName, "subject" ) == 0 )
+            rule.type = RULE_SUBJECT;
+        else if ( strcmp( ruleName, "from" ) == 0 )
+            rule.type = RULE_FROM;
+        else if ( strcmp( ruleName, "msgid" ) == 0 )
+            rule.type = RULE_MSGID;
+        else if ( strcmp( ruleName, "bytes" ) == 0 )
+            rule.type = RULE_BYTES_LT;
+        else if ( strcmp( ruleName, "lines" ) == 0 )
+            rule.type = RULE_LINES_LT;
+        else if ( strcmp( ruleName, "refs" ) == 0 )
+            rule.type = RULE_NOREFS_LT;
+        else if ( strcmp( ruleName, "xposts" ) == 0 )
+            rule.type = RULE_XPOSTS_LT;
+        else if ( strcmp( ruleName, "action" ) == 0 )
+            isAction = TRUE;
+        else
+            goto synErr;
+
+        /* "action" carries no rule type, so rule.type is not read for it */
+        if ( ! isAction &&
+             ( rule.type == RULE_BYTES_LT ||
+               rule.type == RULE_LINES_LT ||
+               rule.type == RULE_NOREFS_LT ||
+               rule.type == RULE_XPOSTS_LT ) )
+        {
+            if ( *l == '=' )
+                rule.type += 1;
+            else if ( *l == '>' )
+                rule.type += 2;
+            else if ( *l != '<' )
+                goto synErr;
+        }
+        else if ( *l != '=' )
+            goto synErr;
+
+        /* Skip past '=' (or '>' or '<') */
+        l++;
+
+        /* OK, we now have a valid rule. What value? */
+        l = getToken( l, value );
+        if ( l == NULL )
+            goto synErr;
+
+        if ( isAction )
+        {
+            if ( seenAction )
+                goto synErr;
+
+            Utl_toLower( value );
+            if ( strcmp( value, "full" ) == 0 )
+                f->action = FILTER_FULL;
+            else if ( strcmp( value, "over" ) == 0 )
+                f->action = FILTER_XOVER;
+            else if ( strcmp( value, "thread" ) == 0 )
+                f->action = FILTER_THREAD;
+            else if ( strcmp( value, "discard" ) == 0 )
+                f->action = FILTER_DISCARD;
+            else
+                goto synErr;
+            seenAction = TRUE;
+        }
+        else if ( rule.type == RULE_NEWSGROUP )
+            Utl_allocAndCpy( &rule.data.grp, value );
+        else if ( rule.type >= RULE_SUBJECT && rule.type <= RULE_MSGID )
+        {
+            if ( regcomp( &rule.data.regex, value, REG_EXTENDED ) != 0 )
+                goto synErr;
+        }
+        else
+        {
+            char * endVal;
+
+            rule.data.amount = strtoul( value, &endVal, 0 );
+            if ( endVal == value
+                 || ( *endVal != '\0' && ! isspace( (unsigned char) *endVal ) ) )
+                goto synErr;
+        }
+
+        if ( ! isAction )
+        {
+            Log_dbg( "Adding rule type %d value %s", rule.type, value );
+            Flt_addRule( f, rule );
+        }
+    }
+
+    Log_dbg( "Adding filter, action %d", f->action );
+    Flt_addFilter( f );
+    return;
+
+synErr:
+    logSyntaxErr( line );
+    /* Do not leak the partly built filter */
+    del_Filter( f );
+    return;
+}
+
 void
 Cfg_read( void )
 {
@@ -573,6 +776,7 @@
         Utl_stripComment( p );
         Utl_cpyStr( lowerLine, p );
         Utl_toLower( lowerLine );
+        p = lowerLine;
         if ( *p == '\0' )
             continue;
         if ( sscanf( p, "%s", name ) != 1 )
@@ -600,7 +804,6 @@
         else if ( strcmp( "default-auto-subscribe-mode", name ) == 0 )
         {
             getStr( s, p );
-            Utl_toLower( s );
             if ( ! isValidAutoSubscribeMode( s ) )
             {
                 logSyntaxErr( line );
@@ -609,14 +812,8 @@
             else
                 strcpy( config.defaultAutoSubscribeMode, s );
         }
-        else if ( strcmp( "server", name ) == 0 )
-            /* Server needs line not p,
-               because password may contain uppercase */
-            getServ( line );
         else if ( strcmp( "mail-to", name ) == 0 )
             getStr( config.mailTo, p );
-        else if ( strcmp( "path-header", name ) == 0 )
-            getStr( config.pathHeader, p );
         else if ( strcmp( "expire", name ) == 0 )
             getExpire( p );
         else if ( strcmp( "auto-subscribe-mode", name ) == 0 )
@@ -625,6 +822,13 @@
             getGroups( p, TRUE );
         else if ( strcmp( "omitgroups", name ) == 0 )
             getGroups( p, FALSE );
+        /* The following need line because they may have uppercase data */
+        else if ( strcmp( "server", name ) == 0 )
+            getServ( line );
+        else if ( strcmp( "path-header", name ) == 0 )
+            getStr( config.pathHeader, line );
+        else if ( strcmp( "filter", name ) == 0 )
+            getFilter( line );
         else
             Log_err( "Unknown config option: %s", name );
     }
diff -r 3c71e28c8eef -r 8897b7e3b108 src/filter.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/filter.c Wed Aug 09 22:19:17 2000 +0100
@@ -0,0 +1,339 @@
+/*
+  filter.c
+
+  Article filtering.
+
+  $Id: filter.c 189 2000-08-09 21:19:17Z bears $
+*/
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "filter.h"
+
+#include <stdlib.h>
+#include "common.h"
+#include "itemlist.h"
+#include "log.h"
+#include "wildmat.h"
+
+struct
+{
+    int nFilters;
+    int maxFilters;
+    const Filter **filters;
+    Bool needGroups;
+} filter = { 0, 0, NULL, FALSE };
+
+/* Count the entries of a comma separated group list (commas + 1). */
+static unsigned long
+countGroups( const char *grps )
+{
+    unsigned long res;
+
+    res = 1;
+    while ( *grps != '\0' )
+    {
+        if ( *grps == ',' )
+            res++;
+        grps++;
+    }
+
+    return res;
+}
+
+/* Count the <...> bracketed message IDs in a References: value. */
+static unsigned long
+countRefs( const char *refs )
+{
+    unsigned long res;
+    Bool inRef;
+
+    res = 0;
+    inRef = FALSE;
+
+    while ( *refs != '\0' )
+    {
+        if ( inRef )
+        {
+            if ( *refs == '>' )
+            {
+                inRef = FALSE;
+                res++;
+            }
+        }
+        else if ( *refs == '<' )
+            inRef = TRUE;
+        refs++;
+    }
+
+    return res;
+}
+
+/*
+ * Check a single rule to see if it passes. Group rules are tried
+ * against thisGrp and then against each entry of newsgroups; xposts
+ * rules count the entries of newsgroups. newsgroups may be NULL.
+ * The remaining rules are checked against the overview ov.
+ */
+static Bool
+checkRule( const char *thisGrp, const char *newsgroups,
+           const Over *ov, const FilterRule *r )
+{
+    unsigned long ul;
+    ItemList *grps;
+    const char *p;
+    Bool matched;
+
+    switch( r->type )
+    {
+    case RULE_NEWSGROUP:
+        if ( Wld_match( thisGrp, r->data.grp ) )
+            return TRUE;
+        matched = FALSE;
+        if ( newsgroups != NULL )
+        {
+            grps = new_Itl( newsgroups, " ,\t" );
+            for ( p = Itl_first( grps ); p != NULL; p = Itl_next( grps ) )
+                if ( Wld_match( p, r->data.grp ) )
+                {
+                    matched = TRUE;
+                    break;
+                }
+            /* Free the list on every path; returning from inside the
+               loop would leak it. */
+            del_Itl( grps );
+        }
+        return matched;
+
+    case RULE_SUBJECT:
+        return ( regexec( &r->data.regex, Ov_subj( ov ), 0, NULL, 0 ) == 0 );
+
+    case RULE_FROM:
+        return ( regexec( &r->data.regex, Ov_from( ov ), 0, NULL, 0 ) == 0 );
+
+    case RULE_BYTES_LT:
+        return ( Ov_bytes( ov ) < r->data.amount );
+
+    case RULE_BYTES_EQ:
+        return ( Ov_bytes( ov ) == r->data.amount );
+
+    case RULE_BYTES_GT:
+        return ( Ov_bytes( ov ) > r->data.amount );
+
+    case RULE_LINES_LT:
+        return ( Ov_lines( ov ) < r->data.amount );
+
+    case RULE_LINES_EQ:
+        return ( Ov_lines( ov ) == r->data.amount );
+
+    case RULE_LINES_GT:
+        return ( Ov_lines( ov ) > r->data.amount );
+
+    case RULE_MSGID:
+        return ( regexec( &r->data.regex, Ov_msgId( ov ), 0, NULL, 0 ) == 0 );
+
+    case RULE_NOREFS_LT:
+        ul = countRefs( Ov_ref( ov ) );
+        return ( ul < r->data.amount );
+
+    case RULE_NOREFS_EQ:
+        ul = countRefs( Ov_ref( ov ) );
+        return ( ul == r->data.amount );
+
+    case RULE_NOREFS_GT:
+        ul = countRefs( Ov_ref( ov ) );
+        return ( ul > r->data.amount );
+
+    case RULE_XPOSTS_LT:
+        if ( newsgroups == NULL )
+            return FALSE;
+        ul = countGroups( newsgroups );
+        return ( ul < r->data.amount );
+
+    case RULE_XPOSTS_EQ:
+        if ( newsgroups == NULL )
+            return FALSE;
+        ul = countGroups( newsgroups );
+        return ( ul == r->data.amount );
+
+    case RULE_XPOSTS_GT:
+        if ( newsgroups == NULL )
+            return FALSE;
+        ul = countGroups( newsgroups );
+        return ( ul > r->data.amount );
+    }
+
+    ASSERT( FALSE );     /* Shouldn't get here */
+    return FALSE;        /* defined result in case ASSERT does not abort */
+}
+
+/* Check a single filter to see if it fires. */
+static Bool
+checkFilter( const char *thisGrp, const char *newsgroups,
+             const Over *ov, const Filter *f )
+{
+    int i;
+
+    for ( i = 0; i < f->nRules; i++ )
+        if ( ! checkRule( thisGrp, newsgroups, ov, &f->rules[i] ) )
+            return FALSE;
+
+    return TRUE;
+}
+
+/* Add a filter to the list of filters. */
+void
+Flt_addFilter( const Filter *f )
+{
+    ASSERT( f != NULL );
+
+    if ( ( filter.nFilters + 1 ) > filter.maxFilters )
+    {
+        filter.filters =
+            ( const Filter ** ) realloc( filter.filters,
+                                         ( filter.maxFilters + 5 )
+                                         * sizeof( Filter * ) );
+        if ( filter.filters == NULL )
+        {
+            Log_err( "Could not realloc filter list" );
+            exit( EXIT_FAILURE );
+        }
+        filter.maxFilters += 5;
+    }
+    filter.filters[ filter.nFilters++ ] = f;
+}
+
+/*
+ * Run the rules over the supplied overview. If a specific rule fires,
+ * returns its action. If no rule fires, return the default read mode.
+ */
+FilterAction
+Flt_checkFilters( const char *thisGrp, const char *newsgroups,
+                  const Over *ov, FetchMode mode )
+{
+    int i;
+
+    for ( i = 0; i < filter.nFilters; i++ )
+        if ( checkFilter( thisGrp, newsgroups, ov, filter.filters[ i ] ) )
+        {
+            Log_dbg( "Filter %d fired on message %s", i, Ov_msgId( ov ) );
+            return filter.filters[ i ]->action;
+        }
+
+    switch( mode )
+    {
+    case FULL:      return FILTER_FULL;
+    case THREAD:    return FILTER_THREAD;
+    case OVER:      return FILTER_XOVER;
+    }
+
+    ASSERT( FALSE );     /* Shouldn't get here */
+    return FILTER_FULL;  /* defined result in case ASSERT does not abort */
+}
+
+Filter *
+new_Filter( void )
+{
+    Filter *f;
+
+    if ( ! ( f = ( Filter * ) malloc( sizeof( Filter ) ) ) )
+    {
+        Log_err( "Cannot allocate Filter" );
+        exit( EXIT_FAILURE );
+    }
+    f->nRules = 0;
+    f->maxRules = 0;
+    f->rules = NULL;
+    f->action = FILTER_FULL;
+    return f;
+}
+
+/*
+ * Release a filter, its rule list and the data owned by the rules.
+ * NOTE(review): group patterns are assumed to be malloc'ed (they are
+ * set via Utl_allocAndCpy in configfile.c) -- confirm before relying on it.
+ */
+void
+del_Filter( Filter *f )
+{
+    int i;
+    FilterRule *r;
+
+    if ( f == NULL )
+        return;
+
+    for ( i = 0; i < f->nRules; i++ )
+    {
+        r = &f->rules[ i ];
+        if ( r->type == RULE_NEWSGROUP )
+            free( r->data.grp );
+        else if ( r->type >= RULE_SUBJECT && r->type <= RULE_MSGID )
+            regfree( &r->data.regex );
+    }
+    free( f->rules );
+    free( f );
+}
+
+FilterAction
+Flt_action( const Filter *f )
+{
+    return f->action;
+}
+
+int
+Flt_nRules( const Filter *f )
+{
+    return f->nRules;
+}
+
+/*
+ * Do we have a rule requiring us to fetch the Newsgroups: headers of
+ * articles?
+ */
+Bool
+Flt_getNewsgroups( void )
+{
+    return filter.needGroups;
+}
+
+FilterRule
+Flt_rule( const Filter *f, int ruleNo )
+{
+    ASSERT( ruleNo < f->nRules );
+    return f->rules[ ruleNo ];
+}
+
+void
+Flt_setAction( Filter *f, FilterAction action )
+{
+    f->action = action;
+}
+
+void
+Flt_addRule( Filter *f, FilterRule rule )
+{
+    /* Does the rule require Newsgroups: headers to be fetched? */
+    if ( rule.type == RULE_NEWSGROUP ||
+         ( rule.type >= RULE_XPOSTS_LT && rule.type <= RULE_XPOSTS_GT ) )
+        filter.needGroups = TRUE;
+
+    if ( f->nRules + 1 > f->maxRules )
+    {
+        f->rules =
+            ( FilterRule * ) realloc( f->rules,
+                                      ( f->maxRules + 5 )
+                                      * sizeof( FilterRule ) );
+
+        if ( f->rules == NULL )
+        {
+            Log_err( "Could not realloc rule list" );
+            exit( EXIT_FAILURE );
+        }
+        f->maxRules += 5;
+    }
+    f->rules[ f->nRules++ ] = rule;
+}
+
+
diff -r 3c71e28c8eef -r 8897b7e3b108 src/filter.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/filter.h Wed Aug 09 22:19:17 2000 +0100
@@ -0,0 +1,95 @@
+/*
+  filter.h
+
+  Article filtering.
+
+  $Id: filter.h 189 2000-08-09 21:19:17Z bears $
+*/
+
+#ifndef FILTER_H
+#define FILTER_H
+
+#include <sys/types.h>
+#include <regex.h>
+#include "fetchlist.h"
+#include "over.h"
+
+/* The possible actions in a filter. */
+typedef enum {
+    FILTER_FULL,
+    FILTER_XOVER,
+    FILTER_THREAD,
+    FILTER_DISCARD
+} FilterAction;
+
+/* Representation of a rule. */
+typedef enum {
+    RULE_NEWSGROUP,                     /* Wildmat data */
+    RULE_SUBJECT,                       /* Regex data */
+    RULE_FROM,
+    RULE_MSGID,
+    RULE_BYTES_LT, RULE_BYTES_EQ, RULE_BYTES_GT, /* Number data */
+    RULE_LINES_LT, RULE_LINES_EQ, RULE_LINES_GT,
+    RULE_NOREFS_LT, RULE_NOREFS_EQ, RULE_NOREFS_GT,
+    RULE_XPOSTS_LT, RULE_XPOSTS_EQ, RULE_XPOSTS_GT
+} FilterRuleType;
+
+typedef union {
+    regex_t regex;
+    unsigned long amount;
+    char *grp;
+} FilterRuleData;
+
+typedef struct {
+    FilterRuleType type;
+    FilterRuleData data;
+} FilterRule;
+
+/* A single filter is a collection of rules with an action. */
+typedef struct {
+    int nRules;
+    int maxRules;
+    FilterRule *rules;
+    FilterAction action;
+} Filter;
+
+/* Add a filter to the list of filters. */
+void
+Flt_addFilter( const Filter *f );
+
+/*
+ * Run the rules over the supplied overview. If a specific rule fires,
+ * returns its action. If no rule fires, return the default read mode.
+ */
+FilterAction
+Flt_checkFilters( const char *thisGrp, const char *newsgroups,
+                  const Over *ov, FetchMode mode );
+
+/*
+ * Build and access a filter
+ */
+Filter *
+new_Filter( void );
+
+void
+del_Filter( Filter *f );
+
+FilterAction
+Flt_action( const Filter *f );
+
+int
+Flt_nRules( const Filter *f );
+
+Bool
+Flt_getNewsgroups( void );
+
+FilterRule
+Flt_rule( const Filter *f, int ruleNo );
+
+void
+Flt_setAction( Filter *f, FilterAction action );
+
+void
+Flt_addRule( Filter *f, FilterRule rule );
+
+#endif  /* FILTER_H */