Mercurial > dnsbl
comparison src/scanner.cpp @ 28:33e1e3910506
add configurable list of tlds
author | carl |
---|---|
date | Thu, 27 May 2004 10:08:51 -0700 |
parents | 43a4f6b3e668 |
children | 4dfdf33f1db0 |
comparison
equal
deleted
inserted
replaced
27:43a4f6b3e668 | 28:33e1e3910506 |
---|---|
13 | 13 |
14 // object to record things we see in the body content | 14 // object to record things we see in the body content |
15 struct recorder | 15 struct recorder |
16 { | 16 { |
17 string_set *html_tags; // valid tags | 17 string_set *html_tags; // valid tags |
 | 18 string_set *tlds; // valid tlds |
18 string_set hosts; | 19 string_set hosts; |
19 int bad_html_tags; | 20 int bad_html_tags; |
20 int binary_tags; | 21 int binary_tags; |
21 recorder(string_set *html_tags_); | 22 recorder(string_set *html_tags_, string_set *tlds_); |
22 ~recorder(); | 23 ~recorder(); |
23 void empty(); | 24 void empty(); |
24 void new_url(char *host); | 25 void new_url(char *host); |
25 void new_tag(char *tag); | 26 void new_tag(char *tag); |
26 void binary(); | 27 void binary(); |
27 }; | 28 }; |
28 recorder::recorder(string_set *html_tags_) { | 29 recorder::recorder(string_set *html_tags_, string_set *tlds_) { |
29 html_tags = html_tags_; | 30 html_tags = html_tags_; |
 | 31 tlds = tlds_; |
30 bad_html_tags = 0; | 32 bad_html_tags = 0; |
31 binary_tags = 0; | 33 binary_tags = 0; |
32 } | 34 } |
33 recorder::~recorder() { | 35 recorder::~recorder() { |
34 empty(); | 36 empty(); |
35 } | 37 } |
36 void recorder::empty() { | 38 void recorder::empty() { |
37 bad_html_tags = 0; | 39 bad_html_tags = 0; |
 | 40 binary_tags = 0; |
38 discard(hosts); | 41 discard(hosts); |
39 } | 42 } |
40 void recorder::new_url(char *host) { | 43 void recorder::new_url(char *host) { |
41 register_string(hosts, host); | 44 register_string(hosts, host); |
42 } | 45 } |
45 } | 48 } |
46 void recorder::new_tag(char *tag) { | 49 void recorder::new_tag(char *tag) { |
47 string_set::iterator i = html_tags->find(tag); | 50 string_set::iterator i = html_tags->find(tag); |
48 if (i == html_tags->end()) { | 51 if (i == html_tags->end()) { |
49 bad_html_tags++; | 52 bad_html_tags++; |
50 if (debug_syslog && (bad_html_tags < 10)) { | 53 if (debug_syslog && (bad_html_tags < 10) && (binary_tags < 10)) { |
51 // only log the first 10 bad tags | 54 // only log the first 10 bad tags |
52 char buf[200]; | 55 char buf[200]; |
53 snprintf(buf, sizeof(buf), "bad html tag %s", tag); | 56 snprintf(buf, sizeof(buf), "bad html tag %s", tag); |
54 my_syslog(buf); | 57 my_syslog(buf); |
55 } | 58 } |
371 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xfd | 374 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xfd |
372 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xfe | 375 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xfe |
373 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xff | 376 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xff |
374 }; | 377 }; |
375 | 378 |
376 | |
377 char *tlds[] = { | |
378 ".com", | |
379 ".net", | |
380 ".org", | |
381 ".biz", | |
382 ".info", | |
383 NULL | |
384 }; | |
385 | 379 |
386 u_char hex_decode[256] = { | 380 u_char hex_decode[256] = { |
387 0, // 0x00 | 381 0, // 0x00 |
388 0, // 0x01 | 382 0, // 0x01 |
389 0, // 0x02 | 383 0, // 0x02 |
951 // host name recognizer | 945 // host name recognizer |
952 case h_end: { | 946 case h_end: { |
953 pending[--count] = '\0'; // null terminate host name by overwriting the terminator | 947 pending[--count] = '\0'; // null terminate host name by overwriting the terminator |
954 if (!strchr((const char *)pending, '@')) { | 948 if (!strchr((const char *)pending, '@')) { |
955 // not an email address or message id | 949 // not an email address or message id |
956 char *tld; | 950 char *p1 = strchr((const char *)pending, '.'); |
957 for (int i=0; (tld = tlds[i]); i++) { | 951 char *p2 = strrchr((const char *)pending, '.'); |
958 int n = strlen(tld); | 952 if (p1 && (p1 != p2)) { |
959 if (count > n) { | 953 // have two periods, so three components |
960 if (strncasecmp((const char *)(pending+count-n), tld, n) == 0) { | 954 for (int i=1; i<count; i++) pending[i] = tolower(pending[i]); |
961 memory->new_url((char*)pending); | 955 // is last component a tld? |
962 break; | 956 string_set::iterator i = memory->tlds->find(p2); |
963 } | 957 if (i != memory->tlds->end()) memory->new_url((char*)pending); |
964 } | |
965 } | 958 } |
966 } | 959 } |
967 st = h_init; | 960 st = h_init; |
968 } // fall thru | 961 } // fall thru |
969 | 962 |