diff options
Diffstat (limited to 'multi-dim-dict.py')
-rw-r--r-- | multi-dim-dict.py | 42 |
1 files changed, 42 insertions, 0 deletions
import sys
import re

# Given the CSV input below, sum the log size (last field) per exchange per day.

csv_input = """date,process,host,log,bytes
20140206,cme_trader_2,cme0001,0345-cme_trader_2.log.gz,15400000
20140206,phlx_trader_1,phlx0001,0651-phlx_trader_1.log.gz,14100000
20140206,phlx_trader_2,phlx0001,0645-phlx_trader_2.log.gz,13800000
20140207,cme_trader_2,cme0001,0345-cme_trader_2.log.gz,15800000
20140207,cme_trader_3,cme0001,0345-cme_trader_3.log.gz,14200000
20140207,phlx_trader_1,phlx0001,0651-phlx_trader_1.log.gz,24100000"""

# Process names look like "<exchange>_trader_<n>"; capture the exchange prefix.
# "\d+" (not "\d") so that trader numbers with several digits are accepted too.
_PROCESS_RE = re.compile(r"^(.*)_trader_\d+$")


def _parse_record(line):
    """Return (date, exchange, size) for a data line, or None if it is not one."""
    fields = line.split(',')
    if len(fields) < 5:
        return None
    match = _PROCESS_RE.search(fields[1])
    if match is None:
        return None
    try:
        size = int(fields[4])
    except ValueError:
        return None
    return fields[0], match.group(1), size


def sum_log_sizes(csv_text):
    """Aggregate log sizes per day and per exchange.

    The totals are kept in a two-level dict (date -> exchange -> bytes) that is
    updated line by line, so no CSV-like row structure is kept in memory.

    Args:
        csv_text: CSV text with the columns date, process, host, log, bytes.
            The first non-blank line is taken as the header when it does not
            parse as a data record.

    Returns:
        A ``(header, totals)`` tuple: ``header`` is the raw header line (``''``
        when there is none) and ``totals`` maps each date to a dict mapping
        exchange name to the summed size in bytes.
    """
    header = ''
    totals = {}
    first = True
    for line in csv_text.splitlines():
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline
        record = _parse_record(line)
        is_first, first = first, False
        if record is None:
            if is_first:
                header = line
            else:
                # Best effort: report and skip instead of clobbering the header.
                sys.stderr.write(
                    "skipping malformed line: {0!r}\n".format(line))
            continue
        log_date, log_exch, log_size = record
        per_exchange = totals.setdefault(log_date, {})
        per_exchange[log_exch] = per_exchange.get(log_exch, 0) + log_size
    return header, totals


header, exchange_logs = sum_log_sizes(csv_input)

# The input header line is echoed unchanged, followed by "date,exchange,total"
# rows sorted by date and then by exchange.
print(header)
for log_date, per_exchange in sorted(exchange_logs.items()):
    for log_exch, total in sorted(per_exchange.items()):
        print("{0},{1},{2}".format(log_date, log_exch, total))
\ No newline at end of file |