/* string_tokenizer.c * * Notes: Seems like strsep and linked list is a way to go, but then original * string is still destroyed with strsep. So the idea is to work on a copy * of a string and let strsep to its job. One pitfall of a linked list * is that accessing data is not as flexible as with arrays * */ #include #include #include #include void str2dArrTest(void); void strTokenizerTestPrint(char *, const char *); size_t strStrtokNumOfTokens(const char *, const char *); char **strTokenize(const char *, const char *, const size_t); void strTokenizeFree(char **); void strTokenizeAndPrint(const char *, const char *, const size_t); extern inline int isDelim(const char, const char *); size_t numOfTokens(const char *, const char *); #ifdef _WIN32 char *strsep(char**, const char*); #endif void areNumOfTokensFuncConsistent(const char *, const char *); void testTokenizer(void); void strsepTest(void); struct tokenNode { const char *tok; struct tokenNode *next; }; /* this function assumes that a copy of a str buffer has already been made */ struct tokenNode *strTokenizeLinkedList(char *str, const char *delim) { if (!str || !delim) return NULL; struct tokenNode *head = (struct tokenNode *) malloc(sizeof(struct tokenNode)); if (!head ) { fprintf(stderr, "strTokenizeLinkedList failed, %s:%d\n", __FILE__, __LINE__); perror("malloc"); exit(-1); } char *tok; struct tokenNode *newNode; while ((tok = strsep(&str, delim)) != NULL) { newNode = (struct tokenNode *) malloc(sizeof(struct tokenNode)); newNode->tok = tok; newNode->next = head; head = newNode; } return head; } #ifdef _WIN32 /* this function is insane in a good way */ char *strsep(char **stringp, const char *delim) { char *result; if ((stringp == NULL) || (*stringp == NULL)) return NULL; result = *stringp; while (**stringp && !strchr(delim, **stringp)) ++*stringp; if (**stringp) *(*stringp)++ = '\0'; else *stringp = NULL; return result; } #endif void strsepTest(void) { char *buffer, *words[100], *aPtr; int count = 0, i; buffer = strdup("The quick brown fox jumps over the lazy dog"); if (!buffer) return; while ((aPtr = strsep(&buffer, " ")) && count < 100) words[count++] = aPtr; for (i = 0; i < count; i++) printf("%s\n", words[i]); } void str2dArrTest(void) { char *str_arr[200]; *str_arr = (char *) malloc(sizeof(char *) * 200); int i; for (i = 0; i < 200; i++) str_arr[i] = (char *) malloc(sizeof(char) * 31); strcpy(str_arr[0], "Hello"); strcpy(str_arr[1], ", world!"); printf("%s", str_arr[0]); printf("%s", str_arr[1]); } /* warning, str will get modified */ void strTokenizerTestPrint(char *str, const char *delim) { char *pch; printf("splitting string \"%s\" into tokens: \n", str); pch = strtok(str, delim); while (pch != NULL) { printf("%s ", pch); pch = strtok(NULL, delim); } } size_t strStrtokNumOfTokens(const char *str, const char *delim) { if (!str || !delim) return 0; /* make a copy of a string since strtok modifies it */ char *strcopy = (char *) malloc(strlen(str) + 1); if (!strcopy) { fprintf(stderr, "strStrtokNumOfTokens failed, %s:%d\n", __FILE__, __LINE__); perror("malloc"); exit(-1); } strcpy(strcopy, str); char *pch; size_t ret; pch = strtok(strcopy, delim); for (ret = 0; pch != NULL; ret++) pch = strtok(NULL, delim); free(strcopy); return ret; } /* returns tokenized array of strings / char pointers where the last token * in the array points to NULL address, this function fills the array of strings * with a maximum number of maxTokenSize characters if a token exceeds that limit */ char **strTokenize(const char *str, const char *delim, const size_t maxTokenSize) { if (maxTokenSize < 1 || maxTokenSize > 100) /* keep it sane */ return NULL; size_t tokensNum; if ((tokensNum = strStrtokNumOfTokens(str, delim)) == 0) return NULL; /* make a copy of a string since strtok modifies it */ char *strcopy = (char *) malloc(strlen(str) + 1); if (!strcopy) { fprintf(stderr, "strTokenize failed, %s:%d\n", __FILE__, __LINE__); perror("malloc"); exit(-1); } strcpy(strcopy, str); /* char *tokensArr[tokensNum + 1]; */ char **tokensArr; size_t sizeOfTokenPtrArr = sizeof(char *) * (tokensNum + 1); tokensArr = (char **) malloc(sizeOfTokenPtrArr); if (!tokensArr) { fprintf(stderr, "strTokenize failed, %s:%d\n", __FILE__, __LINE__); perror("malloc"); exit(-1); } memset(tokensArr, 0, sizeOfTokenPtrArr); int i; char *pch; for (i = 0; i < tokensNum; i++) { tokensArr[i] = (char *) malloc(sizeof(char) * (maxTokenSize + 1)); if (!(tokensArr[i])) { fprintf(stderr, "strTokenize failed, %s:%d\n", __FILE__, __LINE__); perror("malloc"); exit(-1); } } size_t len; pch = strtok(strcopy, delim); for (i = 0; pch != NULL; i++) { len = strlen(pch); if (len <= maxTokenSize) strcpy(tokensArr[i], pch); else { strncpy(tokensArr[i], pch, maxTokenSize); tokensArr[i][maxTokenSize] = '\0'; } pch = strtok(NULL, delim); } return tokensArr; } void strTokenizeFree(char **tokenArr) { if (!tokenArr) return; int i; for (i = 0; tokenArr[i] != NULL; i++) free(tokenArr[i]); } void strTokenizeAndPrint(const char *strings, const char *delim, const size_t maxTokenLen) { char **tokenized; if ((tokenized = strTokenize(strings, delim, maxTokenLen)) == NULL) { fprintf(stderr, "strTokenizeAndPrint failed\n"); exit(-1); } int i; for (i = 0; tokenized[i] != NULL; i++) printf("token %d: \"%s\"\n", i+1, tokenized[i]); strTokenizeFree(tokenized); } inline int isDelim(const char c, const char *delim) { if (!delim) return 0; size_t delimLen = strlen(delim); char d; int i; for (i = 0; i < delimLen; i++) { d = delim[i]; if (c == d) return 1; } return 0; } size_t numOfTokens(const char *str, const char *delim) { if (!str || !delim) return 0; else if (str[0] == '\0') return 0; size_t numOfTokens = 0; int delimLast = 0; int delimCurr = 0; int i; delimLast = isDelim(str[0], delim); for (i = 1; str[i] != '\0'; i++) { delimCurr = isDelim(str[i], delim); if (!delimLast && delimCurr) numOfTokens++; delimLast = delimCurr; } if (!delimLast) numOfTokens++; return numOfTokens; } /* test */ void areNumOfTokensFuncConsistent(const char *str, const char *delim) { const size_t x = strStrtokNumOfTokens(str, delim); const size_t y = numOfTokens(str, delim); if (x == y) printf("PASS: numOfTokens and strStrtokNumOfTokens are consistent!\n"); else fprintf(stderr, "FAIL: numOfTokens and strStrtokNumOfTokens are inconsistent!\n" "numOfTokens found %lu tokens, strStrtokNumOfTokens found " "%lu tokens\n", y, x); } void testTokenizer(void) { const char *strings = "You've reached the website for Arch Linux" ", a lightweight and flexible Linux® distribution that tries to Keep It Simple." "Currently we have official packages optimized for the i686 and x86-64 architectures." "We complement our official package sets with a community-operated package " "repository that grows in size and quality each and every day."; const char *delim = " ,."; strTokenizeAndPrint(strings, delim, 31); areNumOfTokensFuncConsistent(strings, delim); } int main(int argc, char **argv) { #if 0 strTokenizeAndPrint("v 1//2", " /", 21); printf("dirname is: \"%s\", basename is: \"%s\"\n", dirname(argv[0]), basename(argv[0])); #endif char str[] = "oh noes I will get modified by strsep, I'm so scared"; struct tokenNode *head = strTokenizeLinkedList(str, " "); int i; for (i = 0; head->tok != NULL; i++) { printf("token: %s\n", head->tok); head = head->next; } return 0; }