comparison src/scanner.cpp @ 28:33e1e3910506

Add configurable list of TLDs (top-level domains)
author carl
date Thu, 27 May 2004 10:08:51 -0700
parents 43a4f6b3e668
children 4dfdf33f1db0
comparison
equal deleted inserted replaced
27:43a4f6b3e668 28:33e1e3910506
13 13
14 // object to record things we see in the body content 14 // object to record things we see in the body content
15 struct recorder 15 struct recorder
16 { 16 {
17 string_set *html_tags; // valid tags 17 string_set *html_tags; // valid tags
18 string_set *tlds; // valid tlds
18 string_set hosts; 19 string_set hosts;
19 int bad_html_tags; 20 int bad_html_tags;
20 int binary_tags; 21 int binary_tags;
21 recorder(string_set *html_tags_); 22 recorder(string_set *html_tags_, string_set *tlds_);
22 ~recorder(); 23 ~recorder();
23 void empty(); 24 void empty();
24 void new_url(char *host); 25 void new_url(char *host);
25 void new_tag(char *tag); 26 void new_tag(char *tag);
26 void binary(); 27 void binary();
27 }; 28 };
28 recorder::recorder(string_set *html_tags_) { 29 recorder::recorder(string_set *html_tags_, string_set *tlds_) {
29 html_tags = html_tags_; 30 html_tags = html_tags_;
31 tlds = tlds_;
30 bad_html_tags = 0; 32 bad_html_tags = 0;
31 binary_tags = 0; 33 binary_tags = 0;
32 } 34 }
33 recorder::~recorder() { 35 recorder::~recorder() {
34 empty(); 36 empty();
35 } 37 }
36 void recorder::empty() { 38 void recorder::empty() {
37 bad_html_tags = 0; 39 bad_html_tags = 0;
40 binary_tags = 0;
38 discard(hosts); 41 discard(hosts);
39 } 42 }
40 void recorder::new_url(char *host) { 43 void recorder::new_url(char *host) {
41 register_string(hosts, host); 44 register_string(hosts, host);
42 } 45 }
45 } 48 }
46 void recorder::new_tag(char *tag) { 49 void recorder::new_tag(char *tag) {
47 string_set::iterator i = html_tags->find(tag); 50 string_set::iterator i = html_tags->find(tag);
48 if (i == html_tags->end()) { 51 if (i == html_tags->end()) {
49 bad_html_tags++; 52 bad_html_tags++;
50 if (debug_syslog && (bad_html_tags < 10)) { 53 if (debug_syslog && (bad_html_tags < 10) && (binary_tags < 10)) {
51 // only log the first 10 bad tags 54 // only log the first 10 bad tags
52 char buf[200]; 55 char buf[200];
53 snprintf(buf, sizeof(buf), "bad html tag %s", tag); 56 snprintf(buf, sizeof(buf), "bad html tag %s", tag);
54 my_syslog(buf); 57 my_syslog(buf);
55 } 58 }
371 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xfd 374 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xfd
372 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xfe 375 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xfe
373 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xff 376 {h_init, h_end, t_init, t_bin, t_disc, u_init, u_init, u_init, u_reco, d_init, d_init, d_init, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_init, b_init, b_init, }, // 0xff
374 }; 377 };
375 378
376
377 char *tlds[] = {
378 ".com",
379 ".net",
380 ".org",
381 ".biz",
382 ".info",
383 NULL
384 };
385 379
386 u_char hex_decode[256] = { 380 u_char hex_decode[256] = {
387 0, // 0x00 381 0, // 0x00
388 0, // 0x01 382 0, // 0x01
389 0, // 0x02 383 0, // 0x02
951 // host name recognizer 945 // host name recognizer
952 case h_end: { 946 case h_end: {
953 pending[--count] = '\0'; // null terminate host name by overwriting the terminator 947 pending[--count] = '\0'; // null terminate host name by overwriting the terminator
954 if (!strchr((const char *)pending, '@')) { 948 if (!strchr((const char *)pending, '@')) {
955 // not an email address or message id 949 // not an email address or message id
956 char *tld; 950 char *p1 = strchr((const char *)pending, '.');
957 for (int i=0; (tld = tlds[i]); i++) { 951 char *p2 = strrchr((const char *)pending, '.');
958 int n = strlen(tld); 952 if (p1 && (p1 != p2)) {
959 if (count > n) { 953 // have two periods, so three components
960 if (strncasecmp((const char *)(pending+count-n), tld, n) == 0) { 954 for (int i=1; i<count; i++) pending[i] = tolower(pending[i]);
961 memory->new_url((char*)pending); 955 // is last component a tld?
962 break; 956 string_set::iterator i = memory->tlds->find(p2);
963 } 957 if (i != memory->tlds->end()) memory->new_url((char*)pending);
964 }
965 } 958 }
966 } 959 }
967 st = h_init; 960 st = h_init;
968 } // fall thru 961 } // fall thru
969 962