comparison src/scanner.cpp @ 8:dbe18921f741

integration work on url scanner
author carl
date Thu, 22 Apr 2004 11:25:45 -0700
parents 93ff6d1ef647
children 8c65411cd7ab
comparison
equal deleted inserted replaced
7:93ff6d1ef647 8:dbe18921f741
1 // normal stuff 1 static char* scanner_version="$Id$";
2 #include <stdio.h>
3 #include <stdlib.h>
4
5 // needed for std c++ collections
6 #include <set>
7 #include <map>
8 #include <list>
9
10 // for the dns resolver
11 #include <netinet/in.h>
12 #include <arpa/nameser.h>
13 #include <resolv.h>
14
15 // misc stuff needed here
16 #include <ctype.h>
17 #include <fstream>
18
19 static char* version="$Id$";
20 2
21 using namespace std; 3 using namespace std;
22
23 enum status {oksofar, // not rejected yet
24 white, // whitelisted by envelope from
25 black, // blacklisted by envelope from or to
26 reject}; // rejected by a dns list
27 4
28 enum state {// url decoder states 5 enum state {// url decoder states
29 u_init, 6 u_init,
30 u_http, 7 u_http,
31 u_sla, 8 u_sla,
839 0, // 0xfd 816 0, // 0xfd
840 0, // 0xfe 817 0, // 0xfe
841 0, // 0xff 818 0, // 0xff
842 }; 819 };
843 820
844 #define PENDING_LIMIT 1000 821 #define PENDING_LIMIT 100
845 struct fsa { 822 struct fsa {
846 u_char pending[PENDING_LIMIT]; 823 u_char pending[PENDING_LIMIT];
847 int count; 824 int count;
848 state st; 825 state st;
849 state init; 826 state init;
850 fsa* next; 827 fsa* next;
851 828 string_set *urls;
852 fsa(state init, fsa* next_); 829
830 fsa(state init, fsa* next_, string_set *urls_);
853 void push(u_char *buf, int len); 831 void push(u_char *buf, int len);
854 }; 832 };
855 833
856 fsa::fsa(state init_, fsa* next_) { 834 fsa::fsa(state init_, fsa *next_, string_set *urls_) {
857 count = 0; 835 count = 0;
858 st = init_; 836 st = init_;
859 init = init_; 837 init = init_;
860 next = next_; 838 next = next_;
839 urls = urls_;
861 } 840 }
862 841
863 void fsa::push(u_char *buf, int len) { 842 void fsa::push(u_char *buf, int len) {
864 for (int i=0; i<len; i++) { 843 for (int i=0; i<len; i++) {
865 u_char c = buf[i]; 844 u_char c = buf[i];
882 } 861 }
883 } break; 862 } break;
884 863
885 case u_reco: { 864 case u_reco: {
886 pending[count-1] = 0; 865 pending[count-1] = 0;
887 if (strncasecmp((const char *)pending, "http://", 7) == 0) fprintf(stdout, "%s\n", pending); 866 if (strncasecmp((const char *)pending, "http://", 7) == 0) {
867 urls->insert(strdup((const char *)pending+7));
868 }
888 } // fall thru 869 } // fall thru
889 870
890 case u_init: { 871 case u_init: {
891 count = 0; // discard all characters 872 count = 0; // discard all characters
892 } break; 873 } break;
977 } break; 958 } break;
978 } 959 }
979 } 960 }
980 } 961 }
981 962
982 963 struct url_scanner {
983 964 fsa *urls_parser;
984 //////////////////////////////////////////////// 965 fsa *html_parser;
985 // ask a dns question and get an A record answer 966 fsa *mime_parser;
986 // 967 fsa *b64_parser;
987 static unsigned long dns_interface(char *question); 968
988 static unsigned long dns_interface(char *question) { 969 url_scanner(string_set *urls);
989 u_char answer[NS_PACKETSZ]; 970 ~url_scanner();
990 int length = res_search(question, ns_c_in, ns_t_a, answer, sizeof(answer)); 971 void scan(u_char *buffer, size_t length);
991 if (length < 0) return oksofar; // error in getting answer 972 };
992 // parse the answer 973
993 ns_msg handle; 974 url_scanner::url_scanner(string_set *urls) {
994 ns_rr rr; 975 urls_parser = new fsa(u_init, NULL, urls);
995 if (ns_initparse(answer, length, &handle) != 0) return oksofar; 976 html_parser = new fsa(e_init, urls_parser, NULL);
996 int rrnum = 0; 977 mime_parser = new fsa(m_init, html_parser, NULL);
997 while (ns_parserr(&handle, ns_s_an, rrnum++, &rr) == 0) { 978 b64_parser = new fsa(b_init, mime_parser, NULL);
998 if (ns_rr_type(rr) == ns_t_a) {
999 unsigned long address;
1000 memcpy(&address, ns_rr_rdata(rr), sizeof(address));
1001 return reject;
1002 }
1003 }
1004 return 0;
1005 } 979 }
1006 980
1007 //////////////////////////////////////////////// 981 url_scanner::~url_scanner() {
1008 // check a single dnsbl - we don't try very hard, just 982 delete urls_parser;
1009 // using the default resolver retry settings. If we cannot 983 delete html_parser;
1010 // get an answer, we just accept the mail. The caller 984 delete mime_parser;
1011 // must ensure thread safety. 985 delete b64_parser;
1012 //
1013 static status check_single(int ip, char *suffix);
1014 static status check_single(int ip, char *suffix) {
1015 // make a dns question
1016 const u_char *src = (const u_char *)&ip;
1017 if (src[0] == 127) return oksofar; // don't do dns lookups on localhost
1018 char question[NS_MAXDNAME];
1019 snprintf(question, sizeof(question), "%u.%u.%u.%u.%s.", src[3], src[2], src[1], src[0], suffix);
1020 // ask the question, if we get an A record it implies a blacklisted ip address
1021 unsigned long ans = dns_interface(question);
1022 return (ans) ? reject : oksofar;
1023 } 986 }
1024 987
1025 988 void url_scanner::scan(u_char *buffer, size_t length) {
1026 //////////////////////////////////////////////// 989 b64_parser->push(buffer, length);
1027 // scan a file for URLs
1028 //
1029 static void scan_file(char *fn, fsa& parser);
1030 static void scan_file(char *fn, fsa& parser) {
1031 const int LINE_SIZE = 2000;
1032 char line[LINE_SIZE];
1033 ifstream is(fn);
1034 while (!is.eof()) {
1035 is.getline(line, LINE_SIZE-1);
1036 int n = strlen(line);
1037 line[n++] = '\n';
1038 parser.push((u_char*)line, n);
1039 }
1040 is.close();
1041 } 990 }
1042 991
1043
1044 int main(int argc, char**argv)
1045 {
1046 char *fn = argv[1];
1047 fsa *urls_parser = new fsa(u_init, NULL);
1048 fsa *html_parser = new fsa(e_init, urls_parser);
1049 fsa *mime_parser = new fsa(m_init, html_parser);
1050 fsa *b64_parser = new fsa(b_init, mime_parser);
1051 if (fn) scan_file(fn, *b64_parser);
1052 return 0;
1053 }