
Select Records Incrementally In Mysql And Save To Csv In Python

I need to query the database for some data analysis and I have more than 20 million records. I have limited access to the DB and my query times out after 8 minutes. So, I am trying to query the records incrementally in smaller batches and save them to a CSV file.

Solution 1:

Your code should look like the example below. You can tune its performance with the per_query variable:

c = csv.writer(open("temp.csv", "wb"))
offset = 0
per_query = 10000
while True:
    cur.execute("__the_query__ LIMIT %s OFFSET %s", (per_query, offset))

    rows = cur.fetchall()
    if len(rows) == 0:
        break  # exit the loop at the end of the data

    for row in rows:  # iterate over the rows already fetched
        c.writerow(row)

    offset += per_query
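One caveat: LIMIT/OFFSET paging gets slower as the offset grows, because MySQL still has to walk past all the skipped rows, so with 20 million records the later batches may themselves approach the timeout. If your result set can be ordered by a monotonically increasing key, keyset pagination avoids that. A minimal sketch, under the assumption that the table has an auto-increment primary key named id that is also the first selected column (adjust to your schema):

last_id = 0
while True:
    # Seek past the last id seen instead of counting an ever-growing offset.
    cur.execute("__the_query__ WHERE id > %s ORDER BY id LIMIT %s",
                (last_id, per_query))
    rows = cur.fetchall()
    if len(rows) == 0:
        break
    for row in rows:
        c.writerow(row)
    last_id = rows[-1][0]  # assumes the first column is id

Each batch then starts with an index seek, so every batch costs roughly the same no matter how deep into the table you are.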

Solution 2:

Untested code but this should get you started...

SQL = """
SELECT a.user_id, b.last_name, b.first_name, 
    FLOOR(DATEDIFF(CURRENT_DATE(), c.birth_date) / 365) age,
    DATEDIFF(b.left_date, b.join_date) workDays
    FROM users a
    INNER JOIN users_signup b ON a.user_id a = b.user_id
    INNER JOIN users_personal c ON a.user_id a = c.user_id
    INNER JOIN
    (
        SELECT distinct d.a.user_id FROM users_signup d
        WHERE (user_id >=1 AND user_id <1000000)
        AND d.join_date >= '2013-01-01' and d.join_date < '2014-01-01'
    ) 
    AS t ON a.user_id = t.user_id
    OFFSET %s LIMIT %s 
    """

BATCH_SIZE = 100000

with open("temp.csv", "wb") as f:
    writer = csv.writer(f)
    cursor = db_main.cursor()

    offset = 0
    limit = BATCH_SIZE

    while True:
        cursor.execute(SQL, (limit, offset))
        rows = cursor.fetchall()
        if not rows:
            break  # no more rows, we're done
        for row in rows:
            writer.writerow(row)
        offset += BATCH_SIZE

cursor.close()
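If memory use is also a concern, MySQLdb can stream the result set instead of buffering each batch client-side by using a server-side cursor. A sketch of the connection setup, with placeholder credentials:

import MySQLdb
import MySQLdb.cursors

# SSCursor streams rows from the server rather than loading the whole
# result set into client memory; iterate over the cursor row by row
# instead of calling fetchall().
db_main = MySQLdb.connect(host="localhost", user="user1", passwd="test123",
                          db="mainDB",
                          cursorclass=MySQLdb.cursors.SSCursor)

Note that with a server-side cursor you must consume (or close) each result set before issuing the next query on the same connection.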

Solution 3:

Here is an example implementation that might help you:

from contextlib import contextmanager
import MySQLdb
import csv

connection_args = {"host": "localhost", "port": 1234, "user": "user1", "passwd": "test123", "db": "mainDB"}

@contextmanager
def get_cursor(**kwargs):
    '''Context manager that automatically closes the cursor
    (and the underlying connection) when the block exits.
    '''
    db = MySQLdb.connect(**kwargs)
    cursor = db.cursor()
    try:
        yield cursor
    finally:
        cursor.close()
        db.close()  # also close the connection opened for this batch

# note the placeholders for the limits
query = """ SELECT a.user_id, b.last_name, b.first_name,
        FLOOR(DATEDIFF(CURRENT_DATE(), c.birth_date) / 365) age,
        DATEDIFF(b.left_date, b.join_date) workDays
    FROM users a
    INNER JOIN users_signup b ON a.user_id a = b.user_id
    INNER JOIN users_personal c ON a.user_id a = c.user_id
    INNER JOIN
    (
        SELECT distinct d.a.user_id FROM users_signup d
        WHERE (user_id >= 1 AND user_id < 1000000)
        AND d.join_date >= '2013-01-01' and d.join_date < '2014-01-01'
    ) AS t ON a.user_id = t.user_id OFFSET %s LIMIT %s """

csv_file = csv.writer(open("temp.csv", "wb"))

# One million rows at a time
STEP = 1000000

for step_nb in xrange(0, 20):
    with get_cursor(**connection_args) as cursor:
        cursor.execute(query, (STEP, step_nb * STEP))  # query the DB one batch at a time
        for row in cursor:  # use the cursor instead of fetching everything in memory
            csv_file.writerow(row)
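The snippets above are Python 2 (xrange, csv files opened in "wb" mode). As a sketch of the same loop under Python 3, assuming the mysqlclient fork, which keeps the MySQLdb import name, the csv file is opened in text mode with newline="":

# Python 3 variant of the write loop (mysqlclient keeps the MySQLdb name).
with open("temp.csv", "w", newline="") as f:
    csv_file = csv.writer(f)
    for step_nb in range(20):
        with get_cursor(**connection_args) as cursor:
            cursor.execute(query, (STEP, step_nb * STEP))
            for row in cursor:
                csv_file.writerow(row)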

Edited: I misunderstood what the batch was keyed on (I thought it was on user_id).
