我有一个5.5 GB大小的CSV文件,里面有100多列。我只想从CSV文件中导入特定的列。有哪些可能的方法可以做到这一点?
我想把它导入到两个不同的表中。只有一个字段到一个表,其余字段到另一个表。
我应该使用复制命令在Postgresql或CSV类或SmartCSV类型的宝石为此目的?
问候,苏雷什。
如果我有5Gb的CSV,我最好不使用Rails导入它!但是,你可能有一个需要Rails的用例……
既然你说RAILS,我想你是在谈论一个web请求和ActiveRecord…
如果你不关心等待(挂起一个服务器进程的实例),你可以这样做:
在之前,请注意两点:1)使用临时表,如果出现错误,您不会扰乱您的dest表-当然,这是可选的。2)使用0选项首先截断dest表
控制器动作:def updateDB
remote_file = params[:remote_file] ##<ActionDispatch::Http::UploadedFile>
truncate = (params[:truncate]=='true') ? true : false
if remote_file
result = Model.csv2tempTable(remote_file.original_filename, remote_file.tempfile) if remote_file
if result[:result]
Model.updateFromTempTable(truncate)
flash[:notice] = 'sucess.'
else
flash[:error] = 'Errors: ' + result[:errors].join(" ==>")
end
else
flash[:error] = 'Error: no file given.'
end
redirect_to somewhere_else_path
end
模型方法:
# References:
# http://www.kadrmasconcepts.com/blog/2013/12/15/copy-millions-of-rows-to-postgresql-with-rails/
# http://stackoverflow.com/questions/14526489/using-copy-from-in-a-rails-app-on-heroku-with-the-postgresql-backend
# http://www.postgresql.org/docs/9.1/static/sql-copy.html
#
def self.csv2tempTable(uploaded_name, uploaded_file)
erros = []
begin
#read csv file
file = uploaded_file
Rails.logger.info "Creating temp table...n From: #{uploaded_name}n "
#init connection
conn = ActiveRecord::Base.connection
rc = conn.raw_connection
# remove columns created_at/updated_at
rc.exec "drop table IF EXISTS #{TEMP_TABLE}; "
rc.exec "create table #{TEMP_TABLE} (like #{self.table_name}); "
rc.exec "alter table #{TEMP_TABLE} drop column created_at, drop column updated_at;"
#copy it!
rc.exec("COPY #{TEMP_TABLE} FROM STDIN WITH CSV HEADER")
while !file.eof?
# Add row to copy data
l = file.readline
if l.encoding.name != 'UTF-8'
Rails.logger.info "line encoding is #{l.encoding.name}..."
# ENCODING:
# If the source string is already encoded in UTF-8, then just calling .encode('UTF-8') is a no-op,
# and no checks are run. However, converting it to UTF-16 first forces all the checks for invalid byte
# sequences to be run, and replacements are done as needed.
# Reference: http://stackoverflow.com/questions/2982677/ruby-1-9-invalid-byte-sequence-in-utf-8?rq=1
l = l.encode('UTF-16', 'UTF-8').encode('UTF-8', 'UTF-16')
end
Rails.logger.info "writing line with encoding #{l.encoding.name} => #{l[0..80]}"
rc.put_copy_data( l )
end
# We are done adding copy data
rc.put_copy_end
# Display any error messages
while res = rc.get_result
e_message = res.error_message
if e_message.present?
erros << "Erro executando SQL: n" + e_message
end
end
rescue StandardError => e
erros << "Error in csv2tempTable: n #{e} => #{e.to_yaml}"
end
if erros.present?
Rails.logger.error erros.join("*******************************n")
{ result: false, erros: erros }
else
{ result: true, erros: [] }
end
end
# copy from TEMP_TABLE into self.table_name
# If <truncate> = true, truncates self.table_name first
# If <truncate> = false, update lines from TEMP_TABLE into self.table_name
#
def self.updateFromTempTable(truncate)
erros = []
begin
Rails.logger.info "Refreshing table #{self.table_name}...n Truncate: #{truncate}n "
#init connection
conn = ActiveRecord::Base.connection
rc = conn.raw_connection
#
if truncate
rc.exec "TRUNCATE TABLE #{self.table_name}"
return false unless check_exec(rc)
rc.exec "INSERT INTO #{self.table_name} SELECT *, '#{DateTime.now}' as created_at, '#{DateTime.now}' as updated_at FROM #{TEMP_TABLE}"
return false unless check_exec(rc)
else
#remove lines from self.table_name that are present in temp
rc.exec "DELETE FROM #{self.table_name} WHERE id IN ( SELECT id FROM #{FARMACIAS_TEMP_TABLE} )"
return false unless check_exec(rc)
#copy lines from temp into self + includes timestamps
rc.exec "INSERT INTO #{self.table_name} SELECT *, '#{DateTime.now}' as created_at, '#{DateTime.now}' as updated_at FROM #{FARMACIAS_TEMP_TABLE};"
return false unless check_exec(rc)
end
rescue StandardError => e
Rails.logger.error "Error in updateFromTempTable: n #{e} => #{e.to_yaml}"
return false
end
true
end