-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrobots_txt.c
125 lines (108 loc) · 3.87 KB
/
robots_txt.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#include "robots_txt.h"
/*
 * libcurl write callback: appends one chunk of the response body to a
 * growing, NUL-terminated heap string.
 *
 * new_content  chunk handed over by libcurl; it is NOT NUL-terminated
 * size, nmemb  the chunk is size * nmemb bytes long
 * userdata     address of the `char *` accumulator (installed through
 *              CURLOPT_WRITEDATA); it must point to a valid NUL-terminated
 *              heap string, possibly empty, and is updated if the buffer moves
 *
 * Returns the number of bytes consumed. On size overflow or allocation
 * failure it returns 0 without touching the accumulator, which makes libcurl
 * abort the transfer with a write error.
 */
size_t write_callback(char* new_content, size_t size, size_t nmemb, void* userdata) {
    char **content = (char **)userdata;

    // Reject chunk sizes whose product would wrap around.
    if (size != 0 && nmemb > (size_t)-1 / size) {
        return 0;
    }
    size_t chunk = size * nmemb;
    size_t length_content = strlen(*content);

    // Room is needed for the old data, the new chunk and the terminator.
    if (chunk > (size_t)-1 - length_content - 1) {
        return 0;
    }

    // Keep the old pointer until realloc is known to have succeeded,
    // otherwise a failure would leak the data gathered so far.
    char *grown = realloc(*content, length_content + chunk + 1);
    if (grown == NULL) {
        return 0;
    }

    memcpy(grown + length_content, new_content, chunk);
    // Re-terminate: the next call (and the parser) rely on strlen().
    grown[length_content + chunk] = '\0';
    *content = grown;
    return chunk;
}
void get_robots_txt_urls(char* url, bool no_color, URLNode_t** list_urls_found) {
(void) list_urls_found;
CURL *curl;
CURLcode res;
char* content = malloc(1);
content[0] = 0;
curl = curl_easy_init();
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void*) &content);
if(no_color) {
fprintf(stderr, "Fetching robots.txt: %s...\n", url);
} else {
fprintf(stderr, "%sFetching robots.txt: %s...%s\n", BLUE, url, RESET);
}
res = curl_easy_perform(curl);
if(res != CURLE_OK) {
if(no_color) {
fprintf(stderr, "Failed to fetch robots.txt: %s\n", url);
} else {
fprintf(stderr, "%sFailed to fetch robots.txt: %s%s\n", RED, url, RESET);
}
return ;
}
long status_code;
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status_code);
if(status_code != 200) {
if(no_color) {
fprintf(stderr, "%s doesn't exist.\n", url);
} else {
fprintf(stderr, "%s%s doesn't exist.%s\n", RED, url, RESET);
}
return ;
}
curl_easy_cleanup(curl);
if(no_color) {
fprintf(stderr, "Finished fetching %s.\n", url);
} else {
fprintf(stderr, "%sFinished fetching %s.%s\n", GREEN, url, RESET);
}
char *line = strtok(content, "\n");
while(line != NULL) {
// remove comments
char* next_comment = strchr(line, '#');
if(next_comment != NULL) {
line[next_comment-line] = '\0';
}
// cleanup spaces before at the beginning of the line
while(*line == ' ') {
line = line+1;
}
CURLU* curl_url_handler = curl_url();
curl_url_set(curl_url_handler, CURLUPART_URL, url, 0);
if(strncmp(line, "Disallow:", 9) == 0) {
char* trimed_path = trim_spaces(line + 9);
if(strlen(trimed_path) == 0) {
free(trimed_path);
continue;
}
curl_url_set(curl_url_handler, CURLUPART_URL, trimed_path, 0);
} else if(strncmp(line, "Allow:", 6) == 0) {
char* trimed_path = trim_spaces(line + 6);
if(strlen(trimed_path) == 0) {
free(trimed_path);
continue;
}
curl_url_set(curl_url_handler, CURLUPART_URL, trimed_path, 0);
} else {
curl_url_cleanup(curl_url_handler);
line = strtok(NULL, "\n");
continue;
}
// '*' in path is a wildcard, but doesn't indicate anything,
// so we will have to cut the path before this wildcard
char* path;
curl_url_get(curl_url_handler, CURLUPART_PATH, &path, 0);
char* wildcard_char = strchr(path, '*');
if(wildcard_char) {
char* tmp = wildcard_char;
// find past '/'
while(*tmp != '/' && tmp > path) {
tmp--;
}
// cut the path here
*tmp = 0;
curl_url_set(curl_url_handler, CURLUPART_PATH, path, 0);
free(path);
}
char* url_res;
curl_url_get(curl_url_handler, CURLUPART_URL, &url_res, 0);
stack_url_push(list_urls_found, url_res);
curl_url_cleanup(curl_url_handler);
line = strtok(NULL, "\n");
}
if(no_color) {
fprintf(stderr, "Finished parsing %s.\n", url);
} else {
fprintf(stderr, "%sFinished parsing %s.%s\n", BLUE, url, RESET);
}
}