Friday, March 22, 2013

Using Python for file processing performance

Shell scripting is nice and simple for file processing. However, when joining two files larger than about 2 MB, shell performance drops off sharply, because every lookup goes back through file system IO.

Consider using Python instead: it can hold one file in memory as a dictionary, so each lookup is a hash hit rather than another pass over the disk, and you can see a huge performance improvement.
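The core pattern is to read one file once into a dictionary keyed on the join field, then stream the other file and do dictionary lookups instead of re-reading anything. A minimal sketch of that pattern (the file names, the pipe delimiter, and joining on the first column are assumptions for illustration, not part of the script below):

#!/usr/bin/env python
# Minimal in-memory join sketch: build a dict from one file, stream the other.
# "right.txt" and "left.txt" are made-up file names for illustration.
lookup = {}
for line in open("right.txt", "rt"):
  fields = line.rstrip("\n").split("|")
  lookup[fields[0]] = fields              # key on the join column

for line in open("left.txt", "rt"):
  fields = line.rstrip("\n").split("|")
  if fields[0] in lookup:                 # O(1) dict hit, no file re-scan
    print "|".join(fields + lookup[fields[0]][1:])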

A fuller example, comparing a PG export against an OIM export:

#!/usr/bin/env python
# Usage: run.py [PG FILE] [OIM FILE]
# Reads two pipe-delimited exports and prints the employee codes that
# appear in the PG file but are missing from the OIM file.

import sys
from datetime import datetime

pg_filename = sys.argv[1]
oim_filename = sys.argv[2]

oim_emp_no_list = []  # unique employee numbers seen in the OIM file
pg_emp_no_list = []   # unique employee numbers seen in the PG file

oim_records = {}      # employee number -> OIM record, for O(1) lookups

oim_file = open(oim_filename, "rt")
for line in oim_file:
  items = line.rstrip("\n").split("|")

  # Skip the header row.
  if items[0] == "USR_LAST_NAME": continue
  #if items[73].find("Disabled") == -1: continue

  # Keep just the field we need, keyed by employee number.
  oim_record = {}
  oim_record["USR_EMP_NO"] = items[8]
  oim_records[oim_record["USR_EMP_NO"]] = oim_record

  emp_no = oim_record["USR_EMP_NO"]
  if emp_no not in oim_emp_no_list: oim_emp_no_list.append(emp_no)
oim_file.close()

#print "DEBUG> OIM Active OIM count - %s" % len(oim_emp_no_list)

pg_file = open(pg_filename, "rt")
for line in pg_file:
  items = line.rstrip("\n").split("|")
  pg_record = {}
  pg_record["EmployeeCode"] = items[19]

  emp_no = pg_record["EmployeeCode"]
  if emp_no not in pg_emp_no_list: pg_emp_no_list.append(emp_no)

  # Dictionary lookup replaces re-scanning the OIM file for every PG line.
  try:
    oim_record = oim_records[pg_record["EmployeeCode"]]
  except KeyError:
    # In PG but not in OIM -- report the employee code.
    print "%s" % pg_record["EmployeeCode"]
    #print "%s" % line
    continue
pg_file.close()
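One more performance note: the "emp_no not in oim_emp_no_list" checks above scan a list, which is O(n) per line; on large files a set keeps membership tests O(1). A sketch of that variant for the OIM side (same argument layout as the script above is assumed):

import sys

oim_filename = sys.argv[2]   # same argument layout as the script above

# Same OIM bookkeeping with a set instead of a list.
oim_emp_nos = set()
for line in open(oim_filename, "rt"):
  items = line.rstrip("\n").split("|")
  if items[0] == "USR_LAST_NAME": continue   # skip the header row
  oim_emp_nos.add(items[8])    # add() dedups for free, no O(n) list scan

The same change applies to pg_emp_no_list.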