Read CSV Files with double quotes and comma into SQL DB

2012-6-12 13:04| 发布者: Test| 查看: 3012| 评论: 0|原作者: Brad Schulz|来自: SELECT Blog FROM Brad.Schulz

摘要: In this post, I’d like to talk about Comma-Separated Values in Files (i.e. CSV Files).I had a client with a need to read in a CSV file and do some subsequent processing with it. This was a very small ...

Complete source code

use TempDB /* Change to database appropriate for you */
go

if object_id('usp_ProcessCSVFile','P') is null
begin
exec('create procedure usp_ProcessCSVFile as select X=1')
end
go

/*
==============================================================================================

Procedure usp_ProcessCSVFile

Description/Purpose:
   Read CSV File data in T-SQL.

Parameters:
   @CSVFileName: Fully-qualified filename containing CSV data (visible by the server).
   @HeaderRowExists: Bit column specifying if Header Row exists in the file (Default=1).

Example:
   declare @t (CustomerID int,CompanyName varchar(40), Street varchar(40))
   insert @t
   exec usp_ProcessCSVFile 'C:\Temp\MyCustomerData.CSV'

For more details:
   http://bradsruminations.blogspot.com/2011/01/so-you-want-to-read-csv-files-huh.html

Jan14,2011-Brad Schulz
   Written
Feb14,2011-Brad Schulz
   Thanks to Leonid for catching the problem with empty/null columns. Changed the XPath
   for exracting column data from .value('(col/text())[n]') to .value('(col[n]/text())[1]').

==============================================================================================
*/
alter procedure usp_ProcessCSVFile
   @CSVFilename varchar(1000)
,@HeaderRowExists bit = 1
as

set nocount on

/* Acquire the file contents into a variable */
declare @SqlStmt nvarchar(max)
       ,@CSVContents varchar(max)
set @SqlStmt=N'
select @CSVContents=cast(FileContents as varchar(max))
from openrowset(bulk '''+@CSVFilename+''', single_blob) x(FileContents)'
exec sp_executesql @SqlStmt
                  ,N'@CSVContents varchar(max) output'
                  ,@CSVContents output

/* Remove any LineFeed characters that may be in the file */
set @CSVContents=replace(@CSVContents
                        ,char(10)
                        ,'')

/*
XML-Encode the contents... For example:
Converts & to &
Converts < to <
Converts > to >
This is done because so that these special characters will be UNencoded later
when the calling program shreds the XML using the .nodes() and .value() functions.
Important Note:
It will also encode the Carriage Returns (CHAR(13)) separating the rows to 
*/
set @CSVContents=(select [*]=@CSVContents
                  for xml path(''))

/* Convert all commas to Tab characters */
set @CSVContents=replace(@CSVContents
                        ,','
                        ,char(9))

/* Declare some variables */
declare @CurrentPos int                /* Current character pointer */
       ,@DoubleQuotePos1 int           /* Opening Double-Quote Position */
       ,@DoubleQuotePos2 int           /* Closing Double-Quote Position */
       ,@DoubleDoubleQuotePos int      /* Position of any '""' in the string */
       ,@StringColValue varchar(max)   /* Value of a String Column */
       ,@LenStringColValue int         /* and its length */

/* Initialize our current character position */
set @CurrentPos=0

/* Loop forever */
while 1=1
begin

/* First find a double-quote beyond our current position */
set @DoubleQuotePos1=charindex('"'
                                ,@CSVContents
                                ,@CurrentPos+1)

/* If there isn't one to be found, then we're done */
if @DoubleQuotePos1=0 break

/*
From that opening double-quote, find the position of another
double-quote (hopefully the closing one) and, at the same time,
see if there are any double double-quotes (i.e. '""')
*/
select @DoubleDoubleQuotePos=charindex('""'
                                        ,@CSVContents
                                        ,@DoubleQuotePos1+1)
        ,@DoubleQuotePos2=charindex('"'
                                   ,@CSVContents
                                   ,@DoubleQuotePos1+1)

/*
If the positions are the same, that just means the first
double-quote we found was just a double-double (i.e. '""').
So we'll continue looking forward until we find our first lone
double-quote, which will end up being our closing one.
*/
while @DoubleQuotePos2=@DoubleDoubleQuotePos
begin
    select @DoubleDoubleQuotePos=charindex('""'
                                          ,@CSVContents
                                          ,@DoubleQuotePos2+2)
          ,@DoubleQuotePos2=charindex('"'
                                     ,@CSVContents
                                     ,@DoubleQuotePos2+2)
end

/*
We found the closing double-quote position, so we can
now pull out string column value. Let's also save
the length of that string.
*/
select @StringColValue=substring(@CSVContents
                                  ,@DoubleQuotePos1
                                  ,@DoubleQuotePos2-@DoubleQuotePos1+1)
        ,@LenStringColValue=@DoubleQuotePos2-@DoubleQuotePos1+1

/*
Remember we changed all commas to Tab characters earlier?
Well, if there are any of those Tabs embedded in our
string column value, we will change them back to commas.
*/
set @StringColValue=replace(@StringColValue
                             ,char(9)
                             ,',')

/* Get rid of the opening and closing double-quotes */
set @StringColValue=substring(@StringColValue
                               ,2
                               ,len(@StringColValue)-2)

/* And change all double double-quotes to single double-quotes */
set @StringColValue=replace(@StringColValue
                             ,'""'
                             ,'"')

/*
And update our CSV file contents variable so that our
old string column value is substituted with the new value.
*/
set @CSVContents=stuff(@CSVContents
                        ,@DoubleQuotePos1
                        ,@LenStringColValue
                        ,@StringColValue)

/*
Finally, we set our new character position to point to our
closing double-quote. Remember that we have to account for our
substituted string column value being a different size than
it originally had been.
*/
set @CurrentPos=@DoubleQuotePos2
                 -@LenStringColValue   /* Subtract Old Length */
                 +len(@StringColValue) /* Add New Length */

end

/*
Put XML tags into our file contents variable.
Our Tab characters will tell us where to put the column tags, and our Carriage Return
characters (encoded earlier to ) will tell us where to put our row tags.
For example (assuming T is a Tab):
1TMoeTHoward ---> <row><col>1</col><col>Moe</col><col>Howard</col></row>
*/
set @CSVContents='<row><col>'
               + replace(replace(@CSVContents
                                ,''
                                ,'</col></row><row><col>')
                        ,char(9)
                        ,'</col><col>')
               + '</col></row>'

/* Get rid of any blank rows (most likely at the end of the file */
set @CSVContents=replace(@CSVContents,'<row><col></col></row>','')

/*
Pull out the First Row and count the Number of Columns.
If this first row is a Header Row, then remove it from the contents.
*/
declare @FirstRow varchar(max)
       ,@NumColumns int
set @CurrentPos=charindex('</row>',@CSVContents)
set @FirstRow=left(@CSVContents,@CurrentPos+5)
set @NumColumns=len(@FirstRow)
               -len(replace(@FirstRow,'<col>','<xx>'))
if @HeaderRowExists=1
begin
set @CSVContents=substring(@CSVContents
                            ,@CurrentPos+6
                            ,len(@CSVContents))
end

/*
Build the Select List with Column Names (Column1,Column2,etc up to
the appropriate number of columns) and their XML .value extractions.
For example, for 3 columns, @SelectList will contain:
   Column1=XMLNode.value('(col[1]/text())[1]','varchar(100)')
,Column2=XMLNode.value('(col[2]/text())[1]','varchar(100)')
,Column3=XMLNode.value('(col[3]/text())[1]','varchar(100)')
*/
declare @SelectList nvarchar(max)
;with ColumnCTE as
(
select ColNumber=cast(1 as int)
union all
select ColNumber=cast(ColNumber+1 as int)
from ColumnCTE
where ColNumber<@NumColumns
)
select @SelectList=coalesce(@SelectList+nchar(13)+N'      ,',N'')
                  +N'Column'+cast(ColNumber as nvarchar(10))
                  +N'=XMLNode.value(''(col['
                  +cast(ColNumber as nvarchar(10))
                  +N']/text())[1]'',''varchar(100)'')'
from ColumnCTE

/* Build the SQL Statement */
set @SqlStmt=N'
select '+@SelectList+N'
from (select XMLData=cast(@CSVContents as xml).query(''.'')) F_XML
cross apply XMLData.nodes(''/row'') F_Node(XMLNode)'

/* Execute it */
print @SQLStmt
exec sp_executeSQL @SqlStmt
                  ,N'@CSVContents varchar(max)'
                  ,@CSVContents

go

1 2 345 / 5 页下一页