
Speed Up Insert to SQL Server from CSV File Without Using BULK INSERT or Pandas to_sql

I want to put a Pandas DataFrame as a whole into a table in an MS SQL Server database. BULK INSERT is not allowed for common users like myself. I am using pyodbc to connect to my database.

Solution 1:

The bottleneck you face is that your code sends an INSERT statement for each row in the DataFrame. That is, for a sample data file

id;txt
1;alpha
2;bravo
3;charlie
4;delta
5;echo
6;foxtrot
7;golf

you would need seven (7) round-trips to the server to send the equivalent of

INSERT INTO MySchema.MyTable VALUES (1,'alpha')
INSERT INTO MySchema.MyTable VALUES (2,'bravo')
INSERT INTO MySchema.MyTable VALUES (3,'charlie')
...
INSERT INTO MySchema.MyTable VALUES (7,'golf')

You could speed that up significantly by using a Table Value Constructor to do the same thing in one round-trip:

INSERT INTO MySchema.MyTable VALUES (1,'alpha'),(2,'bravo'),(3,'charlie'), ... ,(7,'golf')
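
To make the mechanics concrete before the full class below, here is a minimal pyodbc sketch of a single table-value-constructor insert; the connection string and table name are placeholders, and note that pyodbc expects the row values flattened into one parameter sequence:

import pyodbc

cnxn = pyodbc.connect('DSN=mydsn', autocommit=True)  # placeholder connection
rows = [(1, 'alpha'), (2, 'bravo'), (3, 'charlie')]
placeholders = ','.join('(?,?)' for _ in rows)  # '(?,?),(?,?),(?,?)'
sql = f'INSERT INTO MySchema.MyTable VALUES {placeholders}'
params = [value for row in rows for value in row]  # flatten rows for pyodbc
cnxn.cursor().execute(sql, params)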

The following code does just that. When I tested it using a file with 5000 rows, running it with rows_per_batch=1000 (the maximum) was about 100 times faster than with rows_per_batch=1 (the equivalent of your current approach).

import numpy
import pandas as pd
import pyodbc
import time


class MyDfInsert:
    def __init__(self, cnxn, sql_stub, data_frame, rows_per_batch=1000):
        # NB: hard limit is 1000 for a SQL Server table value constructor
        self._rows_per_batch = 1000 if rows_per_batch > 1000 else rows_per_batch

        self._cnxn = cnxn
        self._sql_stub = sql_stub
        self._num_columns = None
        self._row_placeholders = None
        self._num_rows_previous = None
        self._all_placeholders = None
        self._sql = None

        row_count = 0
        param_list = list()
        for df_row in data_frame.itertuples():
            param_list.append(tuple(df_row[1:]))  # omit zero-based row index
            row_count += 1
            if row_count >= self._rows_per_batch:
                self._send_insert(param_list)  # send a full batch
                row_count = 0
                param_list = list()
        self._send_insert(param_list)  # send any remaining rows

    def _send_insert(self, param_list):
        if len(param_list) > 0:
            if self._num_columns is None:
                # print('[DEBUG] (building items that depend on the number of columns ...)')
                # this only happens once
                self._num_columns = len(param_list[0])
                self._row_placeholders = ','.join(['?' for x in range(self._num_columns)])
                # e.g. '?,?'
            num_rows = len(param_list)
            if num_rows != self._num_rows_previous:
                # print('[DEBUG] (building items that depend on the number of rows ...)')
                self._all_placeholders = '({})'.format('),('.join([self._row_placeholders for x in range(num_rows)]))
                # e.g. '(?,?),(?,?),(?,?)'
                self._sql = f'{self._sql_stub} VALUES {self._all_placeholders}'
                self._num_rows_previous = num_rows
            params = [int(element) if isinstance(element, numpy.int64) else element
                      for row_tup in param_list for element in row_tup]
            # print('[DEBUG]    sql: ' + repr(self._sql))
            # print('[DEBUG] params: ' + repr(params))
            crsr = self._cnxn.cursor()
            crsr.execute(self._sql, params)


if __name__ == '__main__':
    conn_str = (
        'DRIVER=ODBC Driver 11 for SQL Server;'
        'SERVER=192.168.1.134,49242;'
        'Trusted_Connection=yes;'
    )
    cnxn = pyodbc.connect(conn_str, autocommit=True)
    crsr = cnxn.cursor()
    crsr.execute("CREATE TABLE #tmp (id INT PRIMARY KEY, txt NVARCHAR(50))")

    df = pd.read_csv(r'C:\Users\Gord\Desktop\Query1.txt', sep=';', header=0)

    t0 = time.time()

    MyDfInsert(cnxn, "INSERT INTO #tmp (id, txt)", df, rows_per_batch=1000)

    print()
    print(f'Inserts completed in {time.time() - t0:.2f} seconds.')

    cnxn.close()
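
As a quick sanity check on the batching logic, you can count the rows just before the cnxn.close() call; this is a minimal sketch reusing the crsr, df, and #tmp names from the listing above (the temporary table only lives as long as that connection):

crsr.execute("SELECT COUNT(*) FROM #tmp")
print(f'{crsr.fetchone()[0]} rows inserted (DataFrame has {len(df)}).')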
