介绍如何使用ruby on rails导入5.5Gb的CSV文件到Postgresql中



我有一个5.5 GB大小的CSV文件,里面有100多列。我只想从CSV文件中导入特定的列。有哪些可能的方法可以做到这一点?

我想把它导入到两个不同的表中。只有一个字段到一个表,其余字段到另一个表。

我应该使用复制命令在Postgresql或CSV类或SmartCSV类型的宝石为此目的?

问候,苏雷什。

如果我有5Gb的CSV,我最好不使用Rails导入它!但是,你可能有一个需要Rails的用例……

既然你说RAILS,我想你是在谈论一个web请求和ActiveRecord…

如果你不关心等待(挂起一个服务器进程的实例),你可以这样做:

之前,请注意两点:1)使用临时表,如果出现错误,您不会扰乱您的dest表-当然,这是可选的。2)使用0选项首先截断dest表

控制器动作:

def updateDB
    remote_file = params[:remote_file]   ##<ActionDispatch::Http::UploadedFile>
    truncate = (params[:truncate]=='true') ? true : false
    if remote_file
        result = Model.csv2tempTable(remote_file.original_filename, remote_file.tempfile) if remote_file
        if result[:result]
            Model.updateFromTempTable(truncate)
            flash[:notice] = 'sucess.'
        else
            flash[:error] = 'Errors: ' + result[:errors].join(" ==>")
        end
    else
        flash[:error] = 'Error: no file given.'
    end
    redirect_to somewhere_else_path
end

模型方法:

# References:
# http://www.kadrmasconcepts.com/blog/2013/12/15/copy-millions-of-rows-to-postgresql-with-rails/
# http://stackoverflow.com/questions/14526489/using-copy-from-in-a-rails-app-on-heroku-with-the-postgresql-backend
# http://www.postgresql.org/docs/9.1/static/sql-copy.html
#
def self.csv2tempTable(uploaded_name, uploaded_file)
    erros = []
    begin
        #read csv file
        file = uploaded_file
        Rails.logger.info "Creating temp table...n From: #{uploaded_name}n "
        #init connection
        conn = ActiveRecord::Base.connection
        rc = conn.raw_connection
        # remove columns created_at/updated_at 
        rc.exec "drop   table IF EXISTS #{TEMP_TABLE}; "
        rc.exec "create table #{TEMP_TABLE} (like #{self.table_name}); "
        rc.exec "alter table #{TEMP_TABLE} drop column created_at, drop column updated_at;"
        #copy it!
        rc.exec("COPY #{TEMP_TABLE} FROM STDIN  WITH CSV HEADER")
        while !file.eof?
          # Add row to copy data
          l = file.readline
          if l.encoding.name != 'UTF-8'
              Rails.logger.info "line encoding is #{l.encoding.name}..."
              # ENCODING: 
              # If the source string is already encoded in UTF-8, then just calling .encode('UTF-8') is a no-op,
              # and no checks are run. However, converting it to UTF-16 first forces all the checks for invalid byte
              # sequences to be run, and replacements are done as needed.
              # Reference: http://stackoverflow.com/questions/2982677/ruby-1-9-invalid-byte-sequence-in-utf-8?rq=1
              l = l.encode('UTF-16', 'UTF-8').encode('UTF-8', 'UTF-16')
          end
          Rails.logger.info "writing line with encoding #{l.encoding.name} => #{l[0..80]}"
          rc.put_copy_data(  l  )
        end
        # We are done adding copy data
        rc.put_copy_end
        # Display any error messages
        while res = rc.get_result
          e_message = res.error_message
          if e_message.present?
            erros << "Erro executando SQL: n" + e_message
          end
        end
    rescue StandardError => e
        erros << "Error in csv2tempTable: n #{e} => #{e.to_yaml}"
    end
    if erros.present?
        Rails.logger.error erros.join("*******************************n")
        { result: false, erros: erros }
    else
        { result: true, erros: [] }
    end
end
# copy from TEMP_TABLE into self.table_name
# If <truncate> = true, truncates self.table_name first
# If <truncate> = false, update lines from TEMP_TABLE into self.table_name
#
def self.updateFromTempTable(truncate)
    erros = []
    begin
        Rails.logger.info "Refreshing table #{self.table_name}...n Truncate: #{truncate}n "
        #init connection
        conn = ActiveRecord::Base.connection
        rc = conn.raw_connection
        #
        if truncate
            rc.exec "TRUNCATE TABLE #{self.table_name}"
            return false unless check_exec(rc)
            rc.exec "INSERT INTO #{self.table_name} SELECT *, '#{DateTime.now}' as created_at, '#{DateTime.now}' as updated_at FROM #{TEMP_TABLE}"
            return false unless check_exec(rc)
        else
            #remove lines from self.table_name that are present in temp
            rc.exec "DELETE FROM #{self.table_name} WHERE id IN  ( SELECT id FROM #{FARMACIAS_TEMP_TABLE} )"
            return false unless check_exec(rc)
            #copy lines from temp into self  + includes timestamps
            rc.exec "INSERT INTO #{self.table_name} SELECT *, '#{DateTime.now}' as created_at, '#{DateTime.now}' as updated_at FROM #{FARMACIAS_TEMP_TABLE};"
            return false unless check_exec(rc)
        end
    rescue StandardError => e
        Rails.logger.error  "Error in updateFromTempTable: n #{e} => #{e.to_yaml}"
        return false
    end
    true
end

最新更新