Speed Up Insert to SQL Server from CSV File Without Using BULK INSERT or Pandas to_sql
I want to insert a Pandas DataFrame as a whole into a table in an MS SQL Server database. BULK INSERT is not allowed for ordinary users like me. I am using pyodbc to connect to my database.
Solution 1:
The bottleneck you face is that your code sends an INSERT statement for each row in the DataFrame. That is, for a sample data file
id;txt
1;alpha
2;bravo
3;charlie
4;delta
5;echo
6;foxtrot
7;golf
you would need seven (7) round-trips to the server to send the equivalent of
INSERT INTO MySchema.MyTable VALUES (1,'alpha')
INSERT INTO MySchema.MyTable VALUES (2,'bravo')
INSERT INTO MySchema.MyTable VALUES (3,'charlie')
...
INSERT INTO MySchema.MyTable VALUES (7,'golf')
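In pyodbc terms, that row-by-row pattern looks roughly like the sketch below. (This is just an illustration, assuming a pyodbc connection cnxn and a DataFrame df with the id and txt columns from the sample file; MySchema.MyTable is a placeholder name.)

crsr = cnxn.cursor()
for row in df.itertuples(index=False):
    # one execute() -- and therefore one round-trip -- per DataFrame row
    crsr.execute("INSERT INTO MySchema.MyTable (id, txt) VALUES (?, ?)",
                 int(row.id), row.txt)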
You could speed that up significantly by using a Table Value Constructor to do the same thing in one round-trip:
INSERT INTO MySchema.MyTable VALUES (1,'alpha'),(2,'bravo'),(3,'charlie'), ... ,(7,'golf')
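Done with bind parameters instead of literal values, the single-round-trip version can be sketched in a few lines (same assumptions as the sketch above):

import numpy

rows = [tuple(r) for r in df.itertuples(index=False)]
placeholders = ','.join('(?,?)' for _ in rows)  # '(?,?),(?,?),...'
params = [int(v) if isinstance(v, numpy.int64) else v  # pyodbc prefers plain int over numpy.int64
          for row in rows for v in row]
crsr = cnxn.cursor()
crsr.execute(f"INSERT INTO MySchema.MyTable (id, txt) VALUES {placeholders}", params)

Two limits apply when you scale this up: a table value constructor accepts at most 1000 row value expressions, and SQL Server allows at most 2100 parameters in a single request, so a large DataFrame still has to be sent in batches.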
The following class does just that and handles the batching for you. When I tested it using a file with 5000 rows, running it with rows_per_batch=1000 (the maximum) was about 100 times faster than with rows_per_batch=1 (the equivalent of your current approach).
import numpy
import pandas as pd
import pyodbc
import time


class MyDfInsert:
    def __init__(self, cnxn, sql_stub, data_frame, rows_per_batch=1000):
        # NB: hard limit is 1000 for SQL Server table value constructor
        self._rows_per_batch = 1000 if rows_per_batch > 1000 else rows_per_batch

        self._cnxn = cnxn
        self._sql_stub = sql_stub
        self._num_columns = None
        self._row_placeholders = None
        self._num_rows_previous = None
        self._all_placeholders = None
        self._sql = None

        row_count = 0
        param_list = list()
        for df_row in data_frame.itertuples():
            param_list.append(tuple(df_row[1:]))  # omit zero-based row index
            row_count += 1
            if row_count >= self._rows_per_batch:
                self._send_insert(param_list)  # send a full batch
                row_count = 0
                param_list = list()
        self._send_insert(param_list)  # send any remaining rows

    def _send_insert(self, param_list):
        if len(param_list) > 0:
            if self._num_columns is None:
                # print('[DEBUG] (building items that depend on the number of columns ...)')
                # this only happens once
                self._num_columns = len(param_list[0])
                self._row_placeholders = ','.join(['?' for x in range(self._num_columns)])
                # e.g. '?,?'
            num_rows = len(param_list)
            if num_rows != self._num_rows_previous:
                # print('[DEBUG] (building items that depend on the number of rows ...)')
                self._all_placeholders = '({})'.format('),('.join([self._row_placeholders for x in range(num_rows)]))
                # e.g. '(?,?),(?,?),(?,?)'
                self._sql = f'{self._sql_stub} VALUES {self._all_placeholders}'
                self._num_rows_previous = num_rows
            params = [int(element) if isinstance(element, numpy.int64) else element
                      for row_tup in param_list for element in row_tup]
            # print('[DEBUG] sql: ' + repr(self._sql))
            # print('[DEBUG] params: ' + repr(params))
            crsr = self._cnxn.cursor()
            crsr.execute(self._sql, params)


if __name__ == '__main__':
    conn_str = (
        'DRIVER=ODBC Driver 11 for SQL Server;'
        'SERVER=192.168.1.134,49242;'
        'Trusted_Connection=yes;'
    )
    cnxn = pyodbc.connect(conn_str, autocommit=True)
    crsr = cnxn.cursor()
    crsr.execute("CREATE TABLE #tmp (id INT PRIMARY KEY, txt NVARCHAR(50))")

    df = pd.read_csv(r'C:\Users\Gord\Desktop\Query1.txt', sep=';', header=0)

    t0 = time.time()

    MyDfInsert(cnxn, "INSERT INTO #tmp (id, txt)", df, rows_per_batch=1000)

    print()
    print(f'Inserts completed in {time.time() - t0:.2f} seconds.')

    cnxn.close()
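If you want a quick sanity check that every row arrived, you can count the rows in #tmp on the same connection, before the cnxn.close() call (a sketch using pyodbc's Cursor.fetchval):

# #tmp is session-scoped, so this must run on the same connection,
# before cnxn.close()
inserted = crsr.execute("SELECT COUNT(*) FROM #tmp").fetchval()
print(f'{inserted} rows inserted; DataFrame had {len(df)} rows.')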