r - Is there any way to automatically choose correct read.csv parameters? -
i have number of csv files different file format - columns added (which don't need), header there or not, time format %y-%m-%d %h:%m:%s
or %y%m%d%h%m%s
. there way pre-analysis of csv file choose correct parameters read.csv
?
for ex., have following logics used read files:
# file 1 dataft <- read.csv("file1.csv", header = true, colclasses = c("factor", "factor", "factor", "factor", "integer", "factor", "integer", "factor", "factor", "factor", "integer", "factor", "factor")) dataft[,"ddate"] = as.date(dataft[,"ddate"],"%y-%m-%d") # file 2 datagae <- read.csv("file2.csv", header = false, colclasses = c("factor", "factor", "factor", "factor", "integer", "factor", "integer", "factor", "factor", "factor", "integer", "factor", "factor"), col.names = c("col1", "col2", "ddate", "col4", "col5", "col6", "col7", "col8", "col9", "col10", "col11", "col12", "col13")) datagae[,"ddate"] = as.date(datagae[,"ddate"],"%y%m%d") # file 3 (with column, don't need - not sure how skip it, null doesn't help) datagae <- read.csv("file3.csv", header = false, colclasses = c("factor", "factor", "factor", "factor", "integer", "factor", "integer", "factor", "factor", "factor", "integer", "factor", "factor", null), col.names = c("col1", "col2", "ddate", "col4", "col5", "col6", "col7", "col8", "col9", "col10", "col11", "col12", "col13", "")) datagae[,"ddate"] = as.date(datagae[,"ddate"],"%y%m%d")
(all data frames merged once loaded)
upd. files samples (number of possible file formats limited (and known)!) -
# file 1 or,d,ddate,rdate,changes,class,price,fdate,company,number,minutes,added,source va1,va2,2014-05-24,,0,0,2124,2014-05-22 15:50:16,,,,2014-05-22 12:20:03,tp va1,va2,2014-05-26,,0,0,2124,2014-05-22 15:03:44,,,,2014-05-22 12:20:03,tp va1,va2,2014-06-05,,0,0,2124,2014-05-22 15:48:24,,,,2014-05-22 12:20:03,tp va1,va2,2014-06-09,,0,0,2124,2014-05-22 15:37:35,,,,2014-05-22 12:20:03,tp va1,va2,2014-06-16,,0,0,2124,2014-05-22 14:17:33,,,,2014-05-22 12:20:03,tp # file 2 va2,va4,20140722,,0,3,6164,20140521121156,u1,u141,140,20140521121156,ms va3,va5,20140701,,0,0,15176,20140521145035,s1,s1342,355,20140521145035,ms va3,va6,20140710,,0,0,6676,20140521105118,s1,s1602,105,20140521105118,ms va2,va7,20140729,,0,0,10023,20140521132150,u6,u100,230,20140521132150,ms va2,va5,20140527,,0,0,13209,20140521145005,s7,s115,355,20140521145005,ms # file 3 va8,va3,20140929,,0,0,14571,20140603163257,s1,s233,390,20140603163421,ms,4503623383908352 va9,va0,20140611,,0,0,13329,20140603171428,u6,u355,165,20140603171553,ms,4503639892688896 va2,va4,20140722,,0,3,6164,20140521121156,u1,u141,140,20140521121156,ms,4503659220041728 va3,bax,20140601,,0,0,14176,20140525101531,s1,s1430,250,20140525101608,ms,4503686600458240 va3,ren,20140602,,0,0,10174,20140531213527,s1,s1244,121,20140531213653,ms,4503703511891968 # file 4 or,added,key,source,price,d,av_s,type,number,company,class,changes,minutes,fdate,ddate,code va2,20140808t122044,va2:va9:20140808::0:0:14430:20140808122044,qe,14430,va9,2,319,6156,s1,0,0,90,20140808t122044,20140808t192500,b va2,20140808t122044,va2:va9:20140808::0:0:19180:20140808122044,qe,19180,va9,2,319,6182,s1,0,0,90,20140808t122044,20140808t222000,y va2,20140808t122044,va2:va9:20140808::0:1:14866:20140808122044,qe,14866,va9,1,319,41,s7,1,0,100,20140808t122044,20140808t203500,d va2,20140808t122045,va2:va9:20140808::0:1:35180:20140808122045,qe,35180,va9,2,319,6146,s1,1,0,90,20140808t122045,20140808t171000,c va2,20140808t122044,va2:va9:20140809::0:0:3180:20140808122043,qe,3180,va9,2,319,6186,s1,0,0,95,20140808t122043,20140809t232000,n # file 5 data,key "va1,va2,20140524,,0,0,5969,20140523134902,s7,s1147,140,20140523134902,m/t",4503632376496128 "va2,va3,20140711,,0,0,8824,20140601095714,s1,s6402,175,20140601095839,m/t",4503643113914368 "va1,va3,20140710,,0,0,11678,20140604085203,s1,s1430,250,20140604085329,m/t",4503666467799040 "va2,va1,20140724,,0,0,7109,20140523133835,s7,s793,130,20140523133835,m/t",4503679218483200 "va3,va1,20140925,,0,0,10592,20140604092548,s7,s109,395,20140604092714,m/t",4503694653521920
finally, came following solution:
filecsv <- "file.csv" conn <- file(filecsv, "rt") file <- readlines(conn, n = 1) # read first line fileformat <- null if (file[1]=="or,d,ddate,rdate,changes,class,price,fdate,company,number,minutes,added,source") { fileformat <- 1 data <- read.csv(filecsv, header = true, colclasses = c("factor", "factor", "factor", "factor", "integer", "factor", "integer", "factor", "factor", "factor", "integer", "factor", "factor")) data[,"ddate"] = as.date(dataft[,"ddate"],"%y-%m-%d") } else if (file[1]=="or,added,key,source,price,d,av_s,type,number,company,class,changes,minutes,fdate,ddate,code") { fileformat <- 2 # approach read file } else if (file[1]=="data,key") { fileformat <- 3 # third approach read file } else if (grepl("[a-z]{3},[a-z]{3},20\\d{6},(20\\d{6})?,\\d,\\d,\\d+,20\\d{12}\\.*", file[1])[1] == true && length(strsplit(file[1], ",")[[1]])==13) { fileformat <- 4 } else if (grepl("[a-z]{3},[a-z]{3},20\\d{6},(20\\d{6})?,\\d,\\d,\\d+,20\\d{12}\\.*", file[1])[1] == true && length(strsplit(file[1], ",")[[1]])==14) { fileformat <- 5 }
first line analyzed identify file content , read properly.
Comments
Post a Comment