/* webcrawler.c * * Kamil Kaminski * kkamin8@uic.edu * * * Web Crawler * This scans the root url inputted by the user and looks for all and * tags and crawls from there making sure that is chrooted in root * url * * ToDo: save crawled sites to disk * * */ #include #include #include #include #include #define URL_SZ 1024 struct stat_node { char url[URL_SZ]; char from_url[URL_SZ]; struct stat_node *next, *prev; }; typedef struct stat_node stat_node_t; struct crawl_stats { unsigned int visited_n, checked_n, broken_n; FILE *summary_fl, *visited_fl, *checked_fl, *broken_fl; unsigned header_size; char http_header[sizeof(char) * 1024 * 64]; /* 64KB */ stat_node_t *url_queue; stat_node_t *tail; stat_node_t *visited_list; char url_root[URL_SZ]; /* the domain getting crawled */ char domain[URL_SZ]; }; typedef struct crawl_stats crawl_stats_t; /* function prototypes */ int add_stat_node(stat_node_t **, stat_node_t **, const char *, const char *); int del_stat_node(stat_node_t **); size_t write_data(char *, size_t, size_t, void *); int curl_prepare(CURL **, crawl_stats_t *, int, char **); int curl_finish(CURL *, crawl_stats_t *); int curl_fetchpage(CURL *, crawl_stats_t *, const char *); /* globals */ static char last_redir_url[URL_SZ]; int add_stat_node(stat_node_t **head, stat_node_t **tail, const char *url, const char *from_url) { stat_node_t *node = (stat_node_t *) malloc(sizeof(stat_node_t)); memset(node, 0, sizeof(stat_node_t)); if (!node) { perror("failed to alloc memory for new node to the url queue"); return 1; } else { strncpy(node->url, url, URL_SZ); if (from_url) strncpy(node->from_url, from_url, URL_SZ); if (!*head) { *head = node; if (tail) *tail = node; /* tail bro */ } else { (*head)->prev = node; node->next = *head; (*head) = node; } } return 0; } int del_stat_node(stat_node_t **tail) { stat_node_t *node = *tail; *tail = (*tail)->prev; if (*tail) (*tail)->next = NULL; free(node); return 0; } int exists_stat_node(stat_node_t *list, const char *url) { stat_node_t *iter = list; while (iter) { if (strncmp(iter->url, url, URL_SZ) == 0) return 0; iter = iter->next; } return 1; } size_t write_data(char *ptr, size_t size, size_t nmemb, void *userdata) { /* return size * nmemb; */ return fwrite(ptr, size, nmemb, (FILE *) userdata); } size_t write_header(char *ptr, size_t size, size_t nmemb, void *userdata) { crawl_stats_t *p = (crawl_stats_t *) userdata; size_t sz = size * nmemb; memcpy(&(p->http_header[p->header_size]), ptr, sz); p->header_size += sz; p->http_header[p->header_size] = 0; return sz; } int curl_prepare(CURL **curl_handle, crawl_stats_t *page_stats, int argc, char **argv) { if (!(*curl_handle = curl_easy_init())) { perror("failed to create curl handle"); return 1; } page_stats->summary_fl = fopen(argc == 6 ? argv[2] : "summary", "w"); page_stats->visited_fl = fopen(argc == 6 ? argv[3] : "visited", "w"); page_stats->checked_fl = fopen(argc == 6 ? argv[4] : "checked", "w"); page_stats->broken_fl = fopen(argc == 6 ? argv[5] : "broken", "w"); if (!page_stats->summary_fl || !page_stats->visited_fl || !page_stats->checked_fl || !page_stats->broken_fl) { perror("failed to open stat files for writing"); return 1; } return 0; } int curl_finish(CURL *curl_handle, crawl_stats_t *page_stats) { /* always cleanup */ curl_easy_cleanup(curl_handle); /* write out the summary */ fprintf(page_stats->summary_fl, "Visited_pages: %u\n" "Checked_URLs: %u\n" "Broken_URLs: %u\n", page_stats->visited_n, page_stats->checked_n, page_stats->broken_n); fclose(page_stats->summary_fl); fclose(page_stats->visited_fl); fclose(page_stats->checked_fl); fclose(page_stats->broken_fl); /* dealloc url queue */ stat_node_t *iter = page_stats->visited_list; if (iter && iter->next) { while (iter) { free(iter->prev); iter = iter->prev; } } else if (iter) free(iter); return 0; } int curl_fetchpage(CURL *curl_handle, crawl_stats_t *page_stats, const char *urlname) { CURLcode curlret; curl_easy_setopt(curl_handle, CURLOPT_URL, urlname); /* actually not needed, since we set CURLOPT_WRITEHEADER explicitly */ //curl_easy_setopt(curl_handle, CURLOPT_HEADER, 1L); /* some servers don't like requests that are made without a user-agent * field, so we provide one */ curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0"); char err_buff[CURL_ERROR_SIZE]; curl_easy_setopt(curl_handle, CURLOPT_ERRORBUFFER, err_buff); /* send all data to this function */ curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, write_data); curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, write_header); /* by default passes NULL aka stdout */ FILE *data; if ((data = fopen("curl.data", "w"))) { curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *) data); curl_easy_setopt(curl_handle, CURLOPT_WRITEHEADER, (void *) page_stats); } else { curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, stdout); curl_easy_setopt(curl_handle, CURLOPT_WRITEHEADER, stdout); } if ((curlret = curl_easy_perform(curl_handle)) != CURLE_OK) { fprintf(stderr, "failed to fetch %s, %s\n", urlname, err_buff); page_stats->header_size = 0; /* kill current header */ fclose(data); return 1; } int ret = -1; fprintf(stdout, "fetched %s\n", urlname); int code; /* parse html header code */ char code_msg[256] = { 0 }; sscanf(page_stats->http_header, "HTTP/1.%*[01] %d %[^\r\n] ", &code, code_msg); fprintf(stdout, "\t%d %s, ", code, code_msg); /* printf("%s", page_stats->http_header); */ /* ask for the content-type */ char *ct; ret = curl_easy_getinfo(curl_handle, CURLINFO_CONTENT_TYPE, &ct); if ((CURLE_OK == ret) && ct) printf("content-type: %s\n", ct); if (code >= 400 && code <= 500) { page_stats->header_size = 0; /* kill current header */ fclose(data); return 2; } else if (code == 301 || code == 302) { char newurl[URL_SZ] = { 0 }; char *location = strstr(page_stats->http_header, "Location: "); sscanf(location, "Location: %[^\r\n] ", newurl); if (newurl[0] == '/') /* got relative redir, this sucks */ { char fullurl[URL_SZ] = { 0 }; snprintf(fullurl, URL_SZ, "%s%s", urlname, urlname[strlen(urlname)-1] == '/' ? &newurl[1] : &newurl[0]); strncpy(newurl, fullurl, URL_SZ); } printf("\tredirecting to: %s\n", strlen(newurl) > 120 ? "too long to display" : newurl); strncpy(last_redir_url, newurl, URL_SZ); page_stats->header_size = 0; /* kill current header */ fclose(data); return 3; } fclose(data); /* external regexp to parse out all (a | img) links */ ret = system("sed \'s/>/>/g\' curl.data | sed -r -n \'s/.*<(a|img)[^>]" "*(href|src)=[\"\'\']?([^\" >]*)[\"\'\']?.*$/\\3/p\' | awk '!/#/' > urls.txt"); /* we need to stay chrooted */ if (strstr(urlname, page_stats->url_root)) { FILE *urls_fl; if (!(urls_fl = fopen("urls.txt", "r"))) perror("failed to add urls to the queue, ignoring"); else { fseek(urls_fl, 0L, SEEK_END); size_t file_sz = ftell(urls_fl); fseek(urls_fl, 0L, SEEK_SET); /* this means that there we scanned links in webpage of type text/html */ if (file_sz || strstr(ct, "text/html")) { fprintf(page_stats->visited_fl, "%s\n", urlname); page_stats->visited_n++; } char url[URL_SZ] = { 0 }; while (fgets(url, URL_SZ, urls_fl)) { if (strchr(url, '@')) /* I assume these are e-mails */ continue; url[strlen(url)-1] = '\0'; /* kill newline */ if (strstr(url, page_stats->url_root) || (strstr(url, ".com") || strstr(url, ".org") || strstr(url, ".edu") || strstr(url, ".net") || strstr(url, ".info") || strstr(url, ".uk") || strstr(url, ".to"))) ; /* alright! */ else if (url[0] == '/' && url[1] != '~') { char url_absolute[URL_SZ] = { 0 }; sprintf(url_absolute, "%s%s", urlname, urlname[strlen(urlname)-1] == '/' ? &url[1] : url); strncpy(url, url_absolute, URL_SZ); } else if (strstr(url, "/~")) { char url_absolute[URL_SZ] = { 0 }; sprintf(url_absolute, "%s%s", page_stats->domain, url); strncpy(url, url_absolute, URL_SZ); } else if (strstr(url, "../")) { char url_absolute[URL_SZ] = { 0 }; char url_root_dup[URL_SZ] = { 0 }; strcpy(url_root_dup, page_stats->url_root); if (url_root_dup[strlen(url_root_dup)-1] == '/') url_root_dup[strlen(url_root_dup)-1] = '\0'; char *last = strrchr(url_root_dup, '/'); *last = 0; sprintf(url_absolute, "%s/%s", url_root_dup, &url[3]); strncpy(url, url_absolute, URL_SZ); } else if (strstr(url, "./")) { char url_absolute[URL_SZ] = { 0 }; sprintf(url_absolute, "%s%s", page_stats->url_root, &url[2]); strncpy(url, url_absolute, URL_SZ); } else if (strstr(url, "http:") && !strstr(url, "http://")) { char *colon = strchr(url, ':'); char url_absolute[URL_SZ] = { 0 }; sprintf(url_absolute, "%s%s", page_stats->url_root, &colon[1]); strncpy(url, url_absolute, URL_SZ); } else /* relative webpage, e.g. page.html */ { if (urlname[strlen(urlname)-1] == '/') { char url_absolute[URL_SZ] = { 0 }; sprintf(url_absolute, "%s%s%s", urlname, urlname[strlen(urlname)-1] == '/' ? "" : "/", url); strncpy(url, url_absolute, URL_SZ); } else { /* e.g. domain.com/dir1/page1.html and user requests page2.html */ char url_absolute[URL_SZ] = { 0 }; char urlname_dup[URL_SZ] = { 0 }; strncpy(urlname_dup, urlname, URL_SZ); char *lastslash = strrchr(urlname_dup, '/'); lastslash[1] = '\0'; sprintf(url_absolute, "%s%s", urlname_dup, url); strncpy(url, url_absolute, URL_SZ); } } if (exists_stat_node(page_stats->url_queue, url) == 1) /* avoid dups */ { add_stat_node(&page_stats->url_queue, &page_stats->tail, url, urlname); printf("\tqueued for fetching: %s\n", url); } else printf("\talready queued up: %s\n", url); memset(url, 0, sizeof(url)); /* fgets is quite silly */ } fclose(urls_fl); } } else fprintf(stdout, "\tnot scanning %s for links\n", urlname); page_stats->header_size = 0; /* kill current header */ return 0; } int main(int argc, char **argv) { if (argc != 2 && argc != 6) { fprintf(stderr, "usage: %s \n", argv[0]); fprintf(stderr, "usage: %s \n", argv[0]); return 1; } CURL *curl_handle; crawl_stats_t page_stats; memset(&page_stats, 0, sizeof(page_stats)); if (curl_prepare(&curl_handle, &page_stats, argc, argv) != 0) return 1; /* call our curl routine to fetch a webpage */ int ret = -1; int wasredir = 0; /* make sure url ends with / */ char *url_arg = strndup(argv[1], URL_SZ - 1); size_t url_arg_sz = strlen(url_arg); if (url_arg[url_arg_sz - 1] != '/') { url_arg[url_arg_sz] = '/'; url_arg[url_arg_sz + 1] = '\0'; } /* figure out initial domain name and root url */ char initial_domain[URL_SZ] = { 0 }; char domain[URL_SZ] = { 0 }; char protocol[32] = { 0 }; /* sscanf(url_arg, "%*[(http:|ftp:)]//%[^/]", domain); */ sscanf(url_arg, "%[^:]://%[^/]", protocol, domain); snprintf(initial_domain, URL_SZ, "%s://%s", protocol, domain); printf("initial domain is: %s\n", initial_domain); strncpy(page_stats.domain, initial_domain, URL_SZ); strncpy(page_stats.url_root, url_arg, URL_SZ); do { /* loop through all redirs until we reach root url */ ret = curl_fetchpage(curl_handle, &page_stats, wasredir ? last_redir_url : url_arg); /* kickstart */ if (ret == 3) { wasredir = 1; strncpy(page_stats.url_root, last_redir_url, URL_SZ); /* deal with redir of root url */ } } while (ret == 3); if (ret == 1 || ret == 2) { fprintf(page_stats.broken_fl, "%s\n", wasredir ? last_redir_url : url_arg); fprintf(page_stats.checked_fl, "%s\n", wasredir ? last_redir_url : url_arg); page_stats.checked_n++; page_stats.broken_n++; } else if (ret == 0) { add_stat_node(&(page_stats.visited_list), NULL, wasredir ? last_redir_url : url_arg, NULL); /* mark url as visited */ fprintf(page_stats.checked_fl, "%s\n", wasredir ? last_redir_url : url_arg); page_stats.checked_n++; while (page_stats.tail) { char url[URL_SZ] = { 0 }; char from_url[URL_SZ] = { 0 }; strncpy(url, page_stats.tail->url, URL_SZ); strncpy(from_url, page_stats.tail->from_url, URL_SZ); del_stat_node(&(page_stats.tail)); /* dequeue */ if (exists_stat_node(page_stats.visited_list, url) == 0) { fprintf(stderr, "already visited: %s\n", url); continue; } ret = curl_fetchpage(curl_handle, &page_stats, url); add_stat_node(&(page_stats.visited_list), NULL, url, NULL); /* mark url as visited */ fprintf(page_stats.checked_fl, "%s\n", url); page_stats.checked_n++; switch (ret) { case 2: /* broken */ fprintf(page_stats.broken_fl, "%s from %s\n", url, from_url); page_stats.broken_n++; break; case 3: /* redir */ if (strlen(last_redir_url) == URL_SZ - 1) fprintf(stderr, "\tignoring very long redir, stupid host\n"); else add_stat_node(&(page_stats.url_queue), &(page_stats.tail), last_redir_url, NULL); break; case 4: fprintf(stderr, "what the heck just happened!\n"); break; default: break; } } } /* cleanup */ curl_finish(curl_handle, &page_stats); if (url_arg) free(url_arg); return 0; }