diff src/scanner.cpp @ 28:33e1e3910506

add configurable list of tlds
author carl
date Thu, 27 May 2004 10:08:51 -0700
parents 43a4f6b3e668
children 4dfdf33f1db0
line wrap: on
line diff
--- a/src/scanner.cpp	Sat May 22 22:30:45 2004 -0700
+++ b/src/scanner.cpp	Thu May 27 10:08:51 2004 -0700
@@ -15,18 +15,20 @@
 struct recorder
 {
     string_set  *html_tags; // valid tags
+    string_set  *tlds;      // valid tlds (top level domains); pointer is supplied to the constructor
     string_set  hosts;
     int         bad_html_tags;
     int         binary_tags;
-    recorder(string_set *html_tags_);
+    recorder(string_set *html_tags_, string_set *tlds_);    // keeps both set pointers as given
     ~recorder();
     void empty();
     void new_url(char *host);
     void new_tag(char *tag);
     void binary();
 };
-recorder::recorder(string_set *html_tags_) {
+recorder::recorder(string_set *html_tags_, string_set *tlds_) {   // remember the two set pointers, zero both counters
     html_tags     = html_tags_;
+    tlds          = tlds_;      // pointer is stored as given; the set itself is not copied here
     bad_html_tags = 0;
     binary_tags   = 0;
 }
@@ -35,6 +37,7 @@
 }
 void recorder::empty() {
     bad_html_tags = 0;
+    binary_tags   = 0;          // clear the binary tag count along with the bad tag count
     discard(hosts);
 }
 void recorder::new_url(char *host) {
@@ -47,7 +50,7 @@
     string_set::iterator i = html_tags->find(tag);
     if (i == html_tags->end()) {
         bad_html_tags++;
-        if (debug_syslog && (bad_html_tags < 10)) {
+        if (debug_syslog && (bad_html_tags < 10) && (binary_tags < 10)) {
             // only log the first 10 bad tags
             char buf[200];
             snprintf(buf, sizeof(buf), "bad html tag %s", tag);
@@ -374,15 +377,6 @@
 };
 
 
-char *tlds[] = {
-    ".com",
-    ".net",
-    ".org",
-    ".biz",
-    ".info",
-    NULL
-};
-
 u_char hex_decode[256] = {
     0,  // 0x00
     0,  // 0x01
@@ -953,15 +947,14 @@
                 pending[--count] = '\0';  // null terminate host name by overwriting the terminator
                 if (!strchr((const char *)pending, '@')) {
                     // not an email address or message id
-                    char *tld;
-                    for (int i=0; (tld = tlds[i]); i++) {
-                        int n = strlen(tld);
-                        if (count > n) {
-                            if (strncasecmp((const char *)(pending+count-n), tld, n) == 0) {
-                                memory->new_url((char*)pending);
-                                break;
-                            }
-                        }
+                    char *p1 = strchr((char *)pending, '.');    // first period in the host; non-const cast keeps the char* result valid under the C++ overloads
+                    char *p2 = strrchr((char *)pending, '.');   // last period, i.e. start of the final component
+                    if (p1 && (p1 != p2)) {
+                        // at least two periods, so at least three components
+                        for (int k=0; k<count; k++) pending[k] = tolower((unsigned char)pending[k]);   // fold the whole host, first character included
+                        // is the last component a tld? the key passed still has its leading period - NOTE(review): confirm the configured set stores entries in that form
+                        string_set::iterator tld = memory->tlds->find(p2);
+                        if (tld != memory->tlds->end()) memory->new_url((char*)pending);
                     }
                 }
                 st = h_init;