changeset 28:33e1e3910506

add configurable list of tlds
author carl
date Thu, 27 May 2004 10:08:51 -0700
parents 43a4f6b3e668
children 4dfdf33f1db0
files install.bash sendmail.st src/dnsbl.cpp src/package src/scanner.cpp xml/sample.conf
diffstat 6 files changed, 41 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/install.bash	Sat May 22 22:30:45 2004 -0700
+++ b/install.bash	Thu May 27 10:08:51 2004 -0700
@@ -33,6 +33,9 @@
 if [ ! -f $DST/html-tags.conf ]; then
     cp html-tags.conf $DST
 fi
+if [ ! -f $DST/tld.conf ]; then
+    cp tld.conf $DST
+fi
 mv -f dnsbl $DST
 cp dnsbl.rc /etc/rc.d/init.d/dnsbl
 chmod 755 /etc/rc.d/init.d/dnsbl
Binary file sendmail.st has changed
--- a/src/dnsbl.cpp	Sat May 22 22:30:45 2004 -0700
+++ b/src/dnsbl.cpp	Thu May 27 10:08:51 2004 -0700
@@ -132,6 +132,7 @@
     char *      tag_limit_message;  // error message for excessive bad html tags
     int         tag_limit;          // limit on bad html tags
     string_set  html_tags;          // set of valid html tags
+    string_set  tlds;               // set of valid tld components
     CONFIG();
     ~CONFIG();
 };
@@ -244,7 +245,7 @@
     authenticated = false;
     have_whites   = false;
     only_whites   = true;
-    memory        = new recorder(&pc->html_tags);
+    memory        = new recorder(&pc->html_tags, &pc->tlds);
     scanner       = new url_scanner(memory);
 }
 mlfiPriv::~mlfiPriv() {
@@ -263,7 +264,7 @@
         authenticated = false;
         have_whites   = false;
         only_whites   = true;
-        memory        = new recorder(&pc->html_tags);
+        memory        = new recorder(&pc->html_tags, &pc->tlds);
         scanner       = new url_scanner(memory);
     }
 }
@@ -845,7 +846,8 @@
 static void load_conf(CONFIG &dc, char *fn) {
     dc.config_files.push_back(fn);
     map<char*, int, ltstr> commands;
-    enum {dummy, content, hostlimit, htmllimit, htmltag, dnsbl, dnsbll, envfrom, envto, include, includedcc};
+    enum {dummy, tld, content, hostlimit, htmllimit, htmltag, dnsbl, dnsbll, envfrom, envto, include, includedcc};
+    commands["tld"        ] = tld;
     commands["content"    ] = content;
     commands["host_limit" ] = hostlimit;
     commands["html_limit" ] = htmllimit;
@@ -874,6 +876,15 @@
             // have a decent command
             bool processed = false;
             switch (commands[cmd]) {
+                case tld: {
+                    char *tld = strtok(NULL, delim);
+                    if (!tld) break;                            // no tld value
+                    char buf[200];
+                    snprintf(buf, sizeof(buf), ".%s", tld);
+                    dc.tlds.insert(register_string(buf));       // leading .
+                    processed = true;
+                    } break;
+
                 case content: {
                     char *suff = strtok(NULL, delim);
                     if (!suff) break;                           // no dns suffix
--- a/src/package	Sat May 22 22:30:45 2004 -0700
+++ b/src/package	Thu May 27 10:08:51 2004 -0700
@@ -1,13 +1,14 @@
 #!/bin/bash
 
-VER=dnsbl-2.2
+VER=dnsbl-2.4
 mkdir $VER
     target1=/home/httpd/html/510sg/util/dnsbl.tar.gz
     target2=/home/httpd/html/510sg/dnsbl.conf
     target3=/home/httpd/html/510sg/dnsbl.html
 
     cp sample.conf    $VER/dnsbl.conf
-    cp html-tags.conf $VER/html-tags.conf
+    cp html-tags.conf $VER
+    cp tld.conf       $VER
     cp *cpp           $VER
     cp *rc            $VER
     cp install.bash   $VER
@@ -20,4 +21,7 @@
     echo scp $target1 ns1:$target1
     echo scp $target2 ns1:$target2
     echo scp $target3 ns1:$target3
+    scp $target1 ams:/tmp/`basename $target1`
+    scp $target2 ams:/tmp/`basename $target2`
+    scp $target3 ams:/tmp/`basename $target3`
 rm -rf $VER
--- a/src/scanner.cpp	Sat May 22 22:30:45 2004 -0700
+++ b/src/scanner.cpp	Thu May 27 10:08:51 2004 -0700
@@ -15,18 +15,20 @@
 struct recorder
 {
     string_set  *html_tags; // valid tags
+    string_set  *tlds;      // valid tlds
     string_set  hosts;
     int         bad_html_tags;
     int         binary_tags;
-    recorder(string_set *html_tags_);
+    recorder(string_set *html_tags_, string_set *tlds_);
     ~recorder();
     void empty();
     void new_url(char *host);
     void new_tag(char *tag);
     void binary();
 };
-recorder::recorder(string_set *html_tags_) {
+recorder::recorder(string_set *html_tags_, string_set *tlds_) {
     html_tags     = html_tags_;
+    tlds          = tlds_;
     bad_html_tags = 0;
     binary_tags   = 0;
 }
@@ -35,6 +37,7 @@
 }
 void recorder::empty() {
     bad_html_tags = 0;
+    binary_tags   = 0;
     discard(hosts);
 }
 void recorder::new_url(char *host) {
@@ -47,7 +50,7 @@
     string_set::iterator i = html_tags->find(tag);
     if (i == html_tags->end()) {
         bad_html_tags++;
-        if (debug_syslog && (bad_html_tags < 10)) {
+        if (debug_syslog && (bad_html_tags < 10) && (binary_tags < 10)) {
             // only log the first 10 bad tags
             char buf[200];
             snprintf(buf, sizeof(buf), "bad html tag %s", tag);
@@ -374,15 +377,6 @@
 };
 
 
-char *tlds[] = {
-    ".com",
-    ".net",
-    ".org",
-    ".biz",
-    ".info",
-    NULL
-};
-
 u_char hex_decode[256] = {
     0,  // 0x00
     0,  // 0x01
@@ -953,15 +947,14 @@
                 pending[--count] = '\0';  // null terminate host name by overwriting the terminator
                 if (!strchr((const char *)pending, '@')) {
                     // not an email address or message id
-                    char *tld;
-                    for (int i=0; (tld = tlds[i]); i++) {
-                        int n = strlen(tld);
-                        if (count > n) {
-                            if (strncasecmp((const char *)(pending+count-n), tld, n) == 0) {
-                                memory->new_url((char*)pending);
-                                break;
-                            }
-                        }
+                    char *p1 = strchr((const char *)pending, '.');
+                    char *p2 = strrchr((const char *)pending, '.');
+                    if (p1 && (p1 != p2)) {
+                        // have at least two periods, so at least three components
+                        for (int i=1; i<count; i++) pending[i] = tolower(pending[i]);
+                        // is last component a tld?
+                        string_set::iterator i = memory->tlds->find(p2);
+                        if (i != memory->tlds->end()) memory->new_url((char*)pending);
                     }
                 }
                 st = h_init;
--- a/xml/sample.conf	Sat May 22 22:30:45 2004 -0700
+++ b/xml/sample.conf	Thu May 27 10:08:51 2004 -0700
@@ -4,6 +4,9 @@
 # tokens are separated by spaces or tabs
 #
 #
+# tld:
+#   second token is the tld suffix, without the leading period - com, net, org, etc
+#
 # content:
 #   second token is the dns suffix used for the actual lookups
 #   third  token? is a string enclosed in single quotes, so it
@@ -102,6 +105,7 @@
 host_limit      20                          'Mail containing too many host names rejected'
 html_limit      20                          'Mail containing excessive bad html tags rejected'
 include html-tags.conf
+include tld.conf
 
 
 ##############################################