# HG changeset patch # User carl # Date 1142187339 28800 # Node ID aa07452e641be8fba2bfb5f4af187b8ddb3e09bd # Parent 0094678a16d099ff9b1940214243d690ba367f7a uribl patch from Jeff Evans diff -r 0094678a16d0 -r aa07452e641b AUTHORS --- a/AUTHORS Sun Jan 08 10:27:24 2006 -0800 +++ b/AUTHORS Sun Mar 12 10:15:39 2006 -0800 @@ -5,3 +5,4 @@ John Gunkel Nigel Horne Stephen Johnson + Jeff Evans diff -r 0094678a16d0 -r aa07452e641b ChangeLog --- a/ChangeLog Sun Jan 08 10:27:24 2006 -0800 +++ b/ChangeLog Sun Mar 12 10:15:39 2006 -0800 @@ -1,5 +1,10 @@ $Id$ +5.13 2006-03-12 + patch from Jeff Evans + add SURBL/URIBL lookups, remove trailing dots from hostnames, + allow ip address literals as hostnames. + 5.12 2006-01-08 Use larger resolver buffer to accomodate spammers with many name servers. A current example is life-all.com which needs to retry in diff -r 0094678a16d0 -r aa07452e641b Makefile.am --- a/Makefile.am Sun Jan 08 10:27:24 2006 -0800 +++ b/Makefile.am Sun Mar 12 10:15:39 2006 -0800 @@ -1,7 +1,7 @@ SUBDIRS = src man html info hackdir = $(sysconfdir)/dnsbl hack_SCRIPTS = dnsbl -hack_DATA = dnsbl.conf hosts-ignore.conf html-tags.conf tld.conf +hack_DATA = dnsbl.conf hosts-ignore.conf html-tags.conf tld.conf cctld.conf CLEANFILES = dnsbl xml/dnsbl xml/Makefile EXTRA_DIST = dnsbl.rc $(hack_DATA) dnsbl.spec $(wildcard xml/h*) $(wildcard xml/M*) $(wildcard xml/d*) diff -r 0094678a16d0 -r aa07452e641b NEWS --- a/NEWS Sun Jan 08 10:27:24 2006 -0800 +++ b/NEWS Sun Mar 12 10:15:39 2006 -0800 @@ -1,5 +1,6 @@ $Id$ +5.13 2006-03-12 add SURBL/URIBL lookups, patch from Jeff Evans 5.12 2006-01-08 use larger resolver buffer to accomodate spammers with many name servers 5.11 2005-12-20 switch to autoconf/automake/docbook 5.10 2005-10-16 fix compile error on FC3 diff -r 0094678a16d0 -r aa07452e641b dnsbl.conf --- a/dnsbl.conf Sun Jan 08 10:27:24 2006 -0800 +++ b/dnsbl.conf Sun Mar 12 10:15:39 2006 -0800 @@ -9,7 +9,8 @@ filter sbl-xbl.spamhaus.org "Mail containing %s rejected - sbl; see http://www.spamhaus.org/query/bl?ip=%s"; ignore { include "hosts-ignore.conf"; }; tld { include "tld.conf"; }; - html_tags { include "html-tags.conf"; }; + cctld { include "cctld.conf"; }; +# html_tags { include "html-tags.conf"; }; html_limit off; host_limit soft 20; }; diff -r 0094678a16d0 -r aa07452e641b dnsbl.spec.in --- a/dnsbl.spec.in Sun Jan 08 10:27:24 2006 -0800 +++ b/dnsbl.spec.in Sun Mar 12 10:15:39 2006 -0800 @@ -97,16 +97,18 @@ %docdir %{_datadir}/doc/@PACKAGE@-@VERSION@ %{_datadir}/doc/@PACKAGE@-@VERSION@ %config(noreplace) %{_sysconfdir}/@PACKAGE@ -%config(noreplace) %{_sysconfdir}/@PACKAGE@/*.conf /etc/rc.d/init.d/@PACKAGE@ %dir %attr(0750,@PACKAGE@,root) /var/run/@PACKAGE@ %changelog -* Sun Dec 18 2005 Carl Byington 1.0 +* Fri Mar 10 2006 Carl Byington 5.13 +- remove redundant entry in files section + +* Sun Dec 18 2005 Carl Byington 5.11 - use autoconf and http://www.fedora.us/docs/rpm-packaging-guidelines.html -* Tue Jan 03 2005 Carl Byington 1.4 +* Tue Jan 03 2005 Carl Byington 4.0 - added hosts-ignore conf file - see RELEASE_NOTES diff -r 0094678a16d0 -r aa07452e641b package --- a/package Sun Jan 08 10:27:24 2006 -0800 +++ b/package Sun Mar 12 10:15:39 2006 -0800 @@ -7,6 +7,7 @@ web=/home/httpd/html/510sg/$NAME distlog=/tmp/distcheck +mkdir -p $web chown --recursive root:root * make -f *cvs ./configure >/dev/null @@ -43,6 +44,7 @@ # add packages to the web site wp=$web/packages wp4=$wp/centos4 + mkdir -p $wp4 rp=/usr/src/redhat mv -f $BALL $wp scp $target:$rp/SRPMS/$NAME-$VER*rpm $wp diff -r 0094678a16d0 -r aa07452e641b src/context.cpp --- a/src/context.cpp Sun Jan 08 10:27:24 2006 -0800 +++ b/src/context.cpp Sun Mar 12 10:15:39 2006 -0800 @@ -50,6 +50,7 @@ char *token_soft; char *token_substitute; char *token_tld; +char *token_cctld; char *token_unknown; char *token_verify; char *token_white; @@ -445,12 +446,12 @@ CONTEXTP CONFIG::find_context(char *to) { context_map::iterator i = env_to.find(to); - if (i != env_to.end()) return (*i).second; // found user@domain.tld key + if (i != env_to.end()) return (*i).second; // found user@domain key char *x = strchr(to, '@'); if (x) { x++; i = env_to.find(x); - if (i != env_to.end()) return (*i).second; // found domain.tld key + if (i != env_to.end()) return (*i).second; // found domain key char y = *x; *x = '\0'; i = env_to.find(to); @@ -553,13 +554,13 @@ char *CONTEXT::find_from(char *from) { char *rc = token_inherit; string_map::iterator i = env_from.find(from); - if (i != env_from.end()) rc = (*i).second; // found user@domain.tld key + if (i != env_from.end()) rc = (*i).second; // found user@domain key else { char *x = strchr(from, '@'); if (x) { x++; i = env_from.find(x); - if (i != env_from.end()) rc = (*i).second; // found domain.tld key + if (i != env_from.end()) rc = (*i).second; // found domain key else { char y = *x; *x = '\0'; @@ -577,12 +578,12 @@ CONTEXTP CONTEXT::find_context(char *from) { context_map::iterator i = env_from_context.find(from); - if (i != env_from_context.end()) return (*i).second; // found user@domain.tld key + if (i != env_from_context.end()) return (*i).second; // found user@domain key char *x = strchr(from, '@'); if (x) { x++; i = env_from_context.find(x); - if (i != env_from_context.end()) return (*i).second; // found domain.tld key + if (i != env_from_context.end()) return (*i).second; // found domain key char y = *x; *x = '\0'; i = env_from_context.find(from); @@ -626,6 +627,11 @@ } +string_set& CONTEXT::get_content_cctlds() { + if (content_cctlds.empty() && parent) return parent->get_content_cctlds(); + return content_cctlds; +} + string_set& CONTEXT::get_content_tlds() { if (content_tlds.empty() && parent) return parent->get_content_tlds(); return content_tlds; @@ -693,6 +699,14 @@ } printf("%s }; \n", indent); } + if (!content_cctlds.empty()) { + printf("%s cctld { \n", indent); + printf("%s ", indent); + for (string_set::iterator i=content_cctlds.begin(); i!=content_cctlds.end(); i++) { + printf("%s; ", *i); + } + printf("\n%s }; \n", indent); + } if (!content_tlds.empty()) { printf("%s tld { \n", indent); printf("%s ", indent); @@ -887,6 +901,16 @@ } if (!tsa(tok, token_semi)) return false; } + else if (have == token_cctld) { + if (!tsa(tok, token_lbrace)) return false; + while (true) { + char *have = tok.next(); + if (!have) break; + if (have == token_rbrace) break; // done + me.add_cctld(have); + } + if (!tsa(tok, token_semi)) return false; + } else if (have == token_tld) { if (!tsa(tok, token_lbrace)) return false; while (true) { @@ -1228,6 +1252,7 @@ // void token_init() { token_black = register_string("black"); + token_cctld = register_string("cctld"); token_content = register_string("content"); token_context = register_string("context"); token_dccfrom = register_string("dcc_from"); diff -r 0094678a16d0 -r aa07452e641b src/context.h --- a/src/context.h Sun Jan 08 10:27:24 2006 -0800 +++ b/src/context.h Sun Mar 12 10:15:39 2006 -0800 @@ -100,6 +100,7 @@ char * content_message; // "" string_set content_host_ignore;// hosts to ignore for content sbl checking string_set content_tlds; // + string_set content_cctlds; // string_set html_tags; // set of valid html tags int host_limit; // limit on host names char * host_limit_message; // error message for excessive host names @@ -136,6 +137,7 @@ void set_content_message(char *message) {content_message = message;}; void add_ignore(char *host) {content_host_ignore.insert(host);}; void add_tld(char *tld) {content_tlds.insert(tld);}; + void add_cctld(char *cctld) {content_cctlds.insert(cctld);}; void set_host_limit(int limit) {host_limit = limit;}; void set_host_message(char *message) {host_limit_message = message;}; @@ -155,6 +157,7 @@ char* get_content_message(); string_set& get_content_host_ignore(); string_set& get_content_tlds(); + string_set& get_content_cctlds(); string_set& get_html_tags(); dnsblp_list& get_dnsbl_list(); @@ -186,6 +189,7 @@ }; extern char *token_black; +extern char *token_cctld; extern char *token_content; extern char *token_context; extern char *token_dccfrom; @@ -206,8 +210,8 @@ extern char *token_mailhost; extern char *token_many; extern char *token_off; +extern char *token_ok2; extern char *token_ok; -extern char *token_ok2; extern char *token_on; extern char *token_rbrace; extern char *token_semi; diff -r 0094678a16d0 -r aa07452e641b src/dnsbl.cpp --- a/src/dnsbl.cpp Sun Jan 08 10:27:24 2006 -0800 +++ b/src/dnsbl.cpp Sun Mar 12 10:15:39 2006 -0800 @@ -365,7 +365,7 @@ if (!memory) { // first recipient that needs content filtering sets all // the content filtering parameters - memory = new recorder(this, con.get_html_tags(), con.get_content_tlds()); + memory = new recorder(this, con.get_html_tags(), con.get_content_tlds(), con.get_content_cctlds()); scanner = new url_scanner(memory); content_suffix = con.get_content_suffix(); content_message = con.get_content_message(); @@ -650,6 +650,83 @@ //////////////////////////////////////////////// +// lookup the domain name part of a hostname on two lists +// +bool uriblookup(mlfiPriv &priv ,char *hostname, char *top) ; +bool uriblookup(mlfiPriv &priv, char *hostname, char *top) { + // top is pointer to '.' char at end of base domain, or null for ip address form + // so for hostname of www.fred.mydomain.co.uk + // top points to-----------------------^ + // and we end up looking at only mydomain.co.uk, ignoring the www.fred stuff + char buf[maxlen]; + char buf2[maxlen]; + const char *uriblname[2] = { "multi.surbl.org", "multi.uribl.com" }; + + if (top) { + // add one more component + *top = '\0'; + char *x = strrchr(hostname, '.'); + if (x) hostname = x+1; + *top = '.'; + } + for (int i=0; i<2; i++) { + snprintf(buf, sizeof(buf), "%s.%s", hostname, uriblname[i]); + if (debug_syslog > 2) { + char tmp[maxlen]; + snprintf(tmp, sizeof(tmp), "Looking up %s on %s", hostname, uriblname[i]); + my_syslog(tmp); + } + if (dns_interface(priv, buf, false, NULL)) return true; + } + return false; +} + + +//////////////////////////////////////////////// +// uribl checker +// ------------- +// hostname MUST not have a trailing dot +// If tld, two level lookup. +// Else, look up three level domain. +bool check_uribl(mlfiPriv &priv, char *hostname) ; +bool check_uribl(mlfiPriv &priv, char *hostname) { + in_addr ip; + if (inet_aton(hostname, &ip)) { + char adr[sizeof "255.255.255.255"]; + adr[0] = '\0'; + inet_ntop(AF_INET, (const u_char *)&ip, adr, sizeof(adr)); + return (uriblookup(priv, adr, NULL)); + } + + char *top, *top2, *top3; + top = strrchr(hostname, '.'); + if (top) { + *top = '\0'; + top2 = strrchr(hostname, '.'); + *top = '.'; + + if (top2) { + string_set::iterator i = priv.memory->get_cctlds()->find(top2+1); + string_set::iterator x = priv.memory->get_cctlds()->end(); + // if we have a 2-level-cctld, just look at top three levels of the name + if (i != x) return uriblookup(priv, hostname, top2); + + *top2 = '\0'; + top3 = strrchr(hostname, '.'); + *top2 = '.'; + + // if we have more than 3 levels in the name, look at the top three levels of the name + if (top3 && uriblookup(priv, hostname, top2)) return true; + // if that was not found, fall thru to looking at the top two levels + } + // look at the top two levels of the name + return uriblookup(priv, hostname, top); + } + return false; +} + + +//////////////////////////////////////////////// // check the hosts from the body against the content dnsbl // bool check_hosts(mlfiPriv &priv, bool random, int limit, char *&host, int &ip); @@ -700,10 +777,11 @@ if (ip) { int_set::iterator i = ips.find(ip); if (i == ips.end()) { + // we haven't looked this up yet ips.insert(ip); - if (check_single(priv, ip, suffix)) { - return true; - } + if (check_single(priv, ip, suffix)) return true; + // Check uribl & surbl + if (check_uribl(priv, host)) return true; } } } diff -r 0094678a16d0 -r aa07452e641b src/scanner.cpp --- a/src/scanner.cpp Sun Jan 08 10:27:24 2006 -0800 +++ b/src/scanner.cpp Sun Mar 12 10:15:39 2006 -0800 @@ -92,6 +92,7 @@ fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_); void push(u_char *buf, int len); void pusher(); + void validhost(); void error(char *err); }; @@ -1141,10 +1142,11 @@ //////////////////////////////////////////////// // // -recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_) { +recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &cctlds_) { priv = priv_; html_tags = &html_tags_; tlds = &tlds_; + cctlds = &cctlds_; bad_html_tags = 0; binary_tags = 0; } @@ -1197,6 +1199,29 @@ count = 0; } +void fsa::validhost () { + // remove trailing dot + if (pending[count-1]== '.') pending[--count] = '\0'; + if (!strchr((const char *)pending, '@')) { + // not an email address or message id + char *p1 = strchr((const char *)pending, '.'); + char *p2 = strrchr((const char *)pending, '.'); + char *p3 = strstr((const char *)pending, ".."); + if (p1 && (p1 != p2) & !p3) { + // have two periods, so at least three components, and no empty components + in_addr ip; + if (inet_aton((const char*)pending, &ip)) + memory->new_url((char*)pending); + else { + for (int i=0; iget_tlds()->find(p2+1); + if (i != memory->get_tlds()->end()) memory->new_url((char*)pending); + } + } + } +} + void fsa::push(u_char *buf, int len) { for (int i=0; i 5) { + // need some minimal length host name + //otherwise binary files likely to generate false positives pending[--count] = '\0'; // null terminate host name by overwriting the terminator - if (!strchr((const char *)pending, '@')) { - // not an email address or message id - char *p1 = strchr((const char *)pending, '.'); - char *p2 = strrchr((const char *)pending, '.'); - char *p3 = strstr((const char *)pending, ".."); - if (p1 && (p1 != p2) & !p3) { - // have two periods, so at least three components, and no empty components - for (int i=0; iget_tlds()->find(p2+1); - if (i != memory->get_tlds()->end()) memory->new_url((char*)pending); - } + validhost(); } st = h_init; } // fall thru @@ -1265,18 +1282,16 @@ ////////////////////////////// // url recognizer case u_reco: { - if (count > 13) { // need some minimal length host name after the protocol + if (count > 11) { // need some minimal length host name after the protocol pending[--count] = '\0'; // null terminate host name by overwriting the terminator + // must start with protocol + if (strncasecmp((const char *)pending, "http", 4) == 0) { char *p = strrchr((const char *)pending, '/'); - if (p && // have a leading / - strchr(p, '.') && // require at least one . in a dns name - !strstr(p, "..") && // no empty components in the dns name - (strncasecmp((const char *)pending, "http", 4) == 0)) { // must start with protocol - // we seem to have a host name - p++; // skip the last / - int c = strlen(p); - for (int i=0; inew_url(p); // record it + if (p) { + count = strlen(p+1); + memmove(pending, p+1, count+1); + validhost(); + } } } st = u_init; diff -r 0094678a16d0 -r aa07452e641b src/scanner.h --- a/src/scanner.h Sun Jan 08 10:27:24 2006 -0800 +++ b/src/scanner.h Sun Mar 12 10:15:39 2006 -0800 @@ -11,18 +11,20 @@ mlfiPriv *priv; // needed for syslog string_set *html_tags; // valid tags string_set *tlds; // valid tlds + string_set *cctlds; // valid cctlds string_set hosts; int bad_html_tags; int binary_tags; public: - recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_); + recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &cctlds_); ~recorder() { empty(); }; void empty(); void new_url(char *host); void new_tag(char *tag); void binary(); mlfiPriv *get_priv() { return priv; }; + string_set *get_cctlds() { return cctlds; }; string_set *get_tlds() { return tlds; }; string_set &get_hosts() { return hosts; }; bool excessive_bad_tags(int limit) { return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); };