StackOverflow2013

Note that there are some explanatory texts on larger screens.

plurals

PO
text
Body
copied!<p>Here is code based on the CSV code from <a href="http://cm.bell-labs.com/cm/cs/tpop/index.html" rel="nofollow">The Practice of Programming</a> by Kernighan and Plauger that has been adapted to deal with your weird malformed CSV data. (It wasn't all that hard to do; I already had the main code working and packaged, so I just had to add the CSV output functions and to modify the <code>advquoted()</code> function to handle the weird format in this question.</p> <h3>csv2.h</h3> <pre><code>/* @(#)File: $RCSfile: csv2.h,v $ @(#)Version: $Revision: 2.1 $ @(#)Last changed: $Date: 2012/11/01 22:23:07 $ @(#)Purpose: Scanner for Comma Separated Variable (CSV) Data @(#)Author: J Leffler @(#)Origin: Kernighan & Pike, 'The Practice of Programming' */ /*TABSTOP=4*/ #ifndef CSV2_H #define CSV2_H #ifdef __cplusplus extern "C" { #endif #ifdef MAIN_PROGRAM #ifndef lint /* Prevent over-aggressive optimizers from eliminating ID string */ const char jlss_id_csv2_h[] = "@(#)$Id: csv2.h,v 2.1 2012/11/01 22:23:07 jleffler Exp $"; #endif /* lint */ #endif /* MAIN_PROGRAM */ #include <stdio.h> extern char *csvgetline(FILE *ifp); /* Read next input line */ extern char *csvgetfield(size_t n); /* Return field n */ extern size_t csvnfield(void); /* Return number of fields */ extern void csvreset(void); /* Release space used by CSV */ extern int csvputfield(FILE *ofp, const char *field); extern int csvputline(FILE *ofp, char **fields, int nfields); extern void csvseteol(const char *eol); #ifdef __cplusplus } #endif #endif /* CSV2_H */ </code></pre> <h3>csv2.c</h3> <pre><code>/* @(#)File: $RCSfile: csv2.c,v $ @(#)Version: $Revision: 2.1 $ @(#)Last changed: $Date: 2012/11/01 22:23:07 $ @(#)Purpose: Scanner for Comma Separated Variable (CSV) Data @(#)Modification: Deal with specific malformed CSV @(#)Author: J Leffler @(#)Origin: Kernighan & Pike, 'The Practice of Programming' */ /*TABSTOP=4*/ #ifndef lint /* Prevent over-aggressive optimizers from eliminating ID string */ const char jlss_id_csv2_c[] = "@(#)$Id: csv2.c,v 2.1 2012/11/01 22:23:07 jleffler Exp $"; #endif /* lint */ /* ** See RFC 4180 (http://www.ietf.org/rfc/rfc4180.txt). ** ** Specific malformed CSV - see SO 13183644 (http://stackoverflow.com/questions/13183644). ** Data contains malformed CSV fields like: OK,""this is a problem"",OK ** Two (but not three) field quotes at the start extract as "this is a problem" (with the quotes). */ #include "csv2.h" #include <stdlib.h> #include <string.h> enum { NOMEM = -2 }; static char *line = 0; /* Input line */ static char *sline = 0; /* Split line */ static size_t maxline = 0; /* Size of line[] and sline[] */ static char **field = 0; /* Field pointers */ static size_t maxfield = 0; /* Size of field[] */ static size_t nfield = 0; /* Number of fields */ static char fieldsep[]= ","; /* Field separator characters */ static char fieldquote = '"'; /* Quote character */ static char eolstr[8] = "\n"; void csvreset(void) { free(line); free(sline); free(field); line = 0; sline = 0; field = 0; maxline = maxfield = nfield = 0; } static int endofline(FILE *ifp, int c) { int eol = (c == '\r' || c == '\n'); if (c == '\r') { c = getc(ifp); if (c != '\n' && c != EOF) ungetc(c, ifp); } return(eol); } /* Modified to deal with specific malformed CSV */ static char *advquoted(char *p) { size_t i; size_t j; if (p[0] == fieldquote && (p[1] != *fieldsep && p[1] != fieldquote)) { /* Malformed CSV: ""some stuff"" --> "some stuff" */ /* Find "\"\"," or "\"\"\0" to mark end of field */ /* If we don't find it, drop through to 'regular' case */ char *eof = strstr(&p[2], "\"\""); if (eof != 0 && (eof[2] == *fieldsep || eof[2] == '\0')) { p[eof + 1 - p] = '\0'; return(eof + 2); } } for (i = j = 0; p[j] != '\0'; i++, j++) { if (p[j] == fieldquote && p[++j] != fieldquote) { size_t k = strcspn(p+j, fieldsep); memmove(p+i, p+j, k); // 1 -> i fixing transcription error i += k; j += k; break; } p[i] = p[j]; } p[i] = '\0'; return(p + j); } static int split(void) { char *p; char **newf; char *sepp; int sepc; nfield = 0; if (line[0] == '\0') return(0); strcpy(sline, line); p = sline; do { if (nfield >= maxfield) { maxfield *= 2; newf = (char **)realloc(field, maxfield * sizeof(field[0])); if (newf == 0) return NOMEM; field = newf; } if (*p == fieldquote) sepp = advquoted(++p); else sepp = p + strcspn(p, fieldsep); sepc = sepp[0]; sepp[0] = '\0'; field[nfield++] = p; p = sepp + 1; } while (sepc == ','); return(nfield); } char *csvgetline(FILE *ifp) { size_t i; int c; if (line == NULL) { /* Allocate on first call */ maxline = maxfield = 1; line = (char *)malloc(maxline); /*=C++=*/ sline = (char *)malloc(maxline); /*=C++-*/ field = (char **)malloc(maxfield*sizeof(field[0])); /*=C++=*/ if (line == NULL || sline == NULL || field == NULL) { csvreset(); return(NULL); /* out of memory */ } } for (i = 0; (c = getc(ifp)) != EOF && !endofline(ifp, c); i++) { if (i >= maxline - 1) { char *newl; char *news; maxline *= 2; newl = (char *)realloc(line, maxline); /*=C++=*/ news = (char *)realloc(sline, maxline); /*=C++-*/ if (newl == NULL || news == NULL) { csvreset(); return(NULL); /* out of memory */ } line = newl; sline = news; } line[i] = c; } line[i] = '\0'; if (split() == NOMEM) { csvreset(); return(NULL); } return((c == EOF && i == 0) ? NULL : line); } char *csvgetfield(size_t n) { if (n >= nfield) return(0); return(field[n]); } size_t csvnfield(void) { return(nfield); } int csvputfield(FILE *ofp, const char *ofield) { const char escapes[] = "\",\r\n"; if (strpbrk(ofield, escapes) != 0) { size_t len = strlen(ofield) + 2; const char *pos = ofield; while ((pos = strchr(pos, '"')) != 0) { len++; pos++; } char *space = malloc(len+1); if (space == 0) return EOF; char *cpy = space; pos = ofield; *cpy++ = '"'; char c; while ((c = *pos++) != '\0') { if (c == '"') *cpy++ = c; *cpy++ = c; } *cpy++ = '"'; *cpy = '\0'; int rc = fputs(space, ofp); free(space); return rc; } else return fputs(ofield, ofp); } int csvputline(FILE *ofp, char **fields, int nfields) { for (int i = 0; i < nfields; i++) { if (i > 0) putc(',', ofp); if (csvputfield(ofp, fields[i]) == EOF) return EOF; } return(fputs(eolstr, ofp)); } void csvseteol(const char *eol) { size_t nbytes = strlen(eol); if (nbytes >= sizeof(eolstr)) nbytes = sizeof(eolstr) - 1; memmove(eolstr, eol, nbytes); eolstr[nbytes] = '\0'; } #ifdef TEST int main(void) { char *in_line; while ((in_line = csvgetline(stdin)) != 0) { size_t n = csvnfield(); char *fields[n]; /* C99 VLA */ printf("line = '%s'\n", in_line); for (size_t i = 0; i < n; i++) { printf("field[%zu] = '%s'\n", i, csvgetfield(i)); printf("field[%zu] = [", i); csvputfield(stdout, csvgetfield(i)); fputs("]\n", stdout); fields[i] = csvgetfield(i); } printf("fields[0..%zu] = ", n-1); csvputline(stdout, fields, n); } return(0); } #endif /* TEST */ </code></pre> <p>Compile the code with <code>-DTEST</code> to create a program with the example <code>main()</code> function. You need a C99 compiler; the code in <code>main()</code> uses a VLA (variable length array). You could avoid that with dynamic memory allocation or with pessimistic (overkill) memory allocation (an array of a few thousand pointers isn't going to kill most systems these days, but few CSV files will have a few thousand fields per line).</p> <h3>Example Data</h3> <p>Based closely on the data in the question.</p> <pre><code>009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing" 123458,1234561007,"anything","nothing else",""this one, well, is a problem"","dohicky 503458,1234598094,"nothing","everything else","""this one, well, it isn't a problem""","abelone" 610078,1236100794,"everything","anything else","this ""isn't a problem"", he said.","Orcas Rule" </code></pre> <h3>Example Output</h3> <pre><code>line = '009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"' field[0] = '009098' field[0] = [009098] field[1] = '0981098094' field[1] = [0981098094] field[2] = 'something' field[2] = [something] field[3] = 'something else' field[3] = [something else] field[4] = '"this one, well, is a problem"' field[4] = ["""this one, well, is a problem"""] field[5] = ' "another thing"' field[5] = [" ""another thing"""] fields[0..5] = 009098,0981098094,something,something else,"""this one, well, is a problem"""," ""another thing""" line = '123458,1234561007,"anything","nothing else",""this one, well, is a problem"","dohicky' field[0] = '123458' field[0] = [123458] field[1] = '1234561007' field[1] = [1234561007] field[2] = 'anything' field[2] = [anything] field[3] = 'nothing else' field[3] = [nothing else] field[4] = '"this one, well, is a problem"' field[4] = ["""this one, well, is a problem"""] field[5] = 'dohicky' field[5] = [dohicky] fields[0..5] = 123458,1234561007,anything,nothing else,"""this one, well, is a problem""",dohicky line = '503458,1234598094,"nothing","everything else","""this one, well, it isn't a problem""","abelone"' field[0] = '503458' field[0] = [503458] field[1] = '1234598094' field[1] = [1234598094] field[2] = 'nothing' field[2] = [nothing] field[3] = 'everything else' field[3] = [everything else] field[4] = '"this one, well, it isn't a problem"' field[4] = ["""this one, well, it isn't a problem"""] field[5] = 'abelone' field[5] = [abelone] fields[0..5] = 503458,1234598094,nothing,everything else,"""this one, well, it isn't a problem""",abelone line = '610078,1236100794,"everything","anything else","this ""isn't a problem"", he said.","Orcas Rule"' field[0] = '610078' field[0] = [610078] field[1] = '1236100794' field[1] = [1236100794] field[2] = 'everything' field[2] = [everything] field[3] = 'anything else' field[3] = [anything else] field[4] = 'this "isn't a problem", he said.' field[4] = ["this ""isn't a problem"", he said."] field[5] = 'Orcas Rule' field[5] = [Orcas Rule] fields[0..5] = 610078,1236100794,everything,anything else,"this ""isn't a problem"", he said.",Orcas Rule </code></pre> <p>The fields are printed twice, once to test the field extraction, once to test the field printing. You'd simplify the output by removing the printing except for <code>csvputline()</code> to convert your file from malformed CSV to properly formed CSV.</p>

Querying!

Guidance

An individual column

Larger individual text columns get their own page to allow for proper reading.

SQuiL has stopped working due to an internal error.

If you are curious you may find further information in the browser console, which is accessible through the devtools (F12).

Reload