Python, Import csv to Bigquery Load job failed - Error 404

275 views
Skip to first unread message

Vikram Singh Saggu

unread,
Jul 3, 2014, 6:05:52 AM7/3/14
to gce-dis...@googlegroups.com
Hi,

My Python script to load CSV data into BigQuery fails; the subject line says Error 404, but the response below actually shows HTTP status 400:

{'status': '400', 'alternate-protocol': '443:quic', 'content-length': '39', 'server': 'HTTP Upload Server Built on Jun 12 2014 14:56:53 (1402610213)', 'date': 'Thu, 03 Jul 2014 09:00:57 GMT', 'content-type': 'text/html; charset=UTF-8'}


Below is the script :-

from apiclient.discovery import build
from apiclient.errors import HttpError

from oauth2client.client import AccessTokenRefreshError
from oauth2client.client import OAuth2WebServerFlow
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage
from oauth2client.tools import run

import argparse
import httplib2
import os
import sys

from apiclient import discovery
from oauth2client import file
from oauth2client import client
from oauth2client import tools

# Parser for command-line arguments.
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    parents=[tools.argparser])


# CLIENT_SECRETS is the name of a file containing the OAuth 2.0 information
# for this application, including client_id and client_secret. You can see
# the Client ID and Client secret on the APIs page in the Cloud Console:
CLIENT_SECRETS = os.path.join(os.path.dirname(__file__), 'client_secrets.json')

# Set up a Flow object to be used for authentication.
# BUG FIX: the scope list was empty, so the issued credentials carried no
# BigQuery permission and API calls were rejected (the HTTP 400 seen in the
# thread). Loading data requires the BigQuery scope.
FLOW = client.flow_from_clientsecrets(
    CLIENT_SECRETS,
    scope=['https://www.googleapis.com/auth/bigquery'],
    message=tools.message_if_missing(CLIENT_SECRETS))

def main(argv):
  """Authorize with OAuth 2.0 and load a local CSV file into BigQuery.

  Builds a multipart/related upload body (load-job configuration JSON plus
  the raw CSV bytes), POSTs it to the BigQuery media-upload jobs endpoint,
  then polls the resulting load job every 10 seconds until it is DONE.

  Args:
    argv: full command-line argument list (argv[0] is skipped).
  """
  import json
  import time

  # Parse the command-line flags.
  flags = parser.parse_args(argv[1:])

  # If the credentials don't exist or are invalid run through the native
  # client flow. The Storage object will ensure that if successful the good
  # credentials will get written back to the file.
  storage = file.Storage('sample.dat')
  credentials = storage.get()
  if credentials is None or credentials.invalid:
    credentials = tools.run_flow(FLOW, storage, flags)

  # Create an httplib2.Http object to handle our HTTP requests and
  # authorize it with our good Credentials.
  http = credentials.authorize(httplib2.Http())

  # Construct the service object for interacting with the BigQuery API
  # (used below to poll the load job's status).
  service = discovery.build('bigquery', 'v2', http=http)

  projectId = "synthetic-trail-459"
  datasetId = "GIE"
  tableId = "csbq"

  # BUG FIX: the original request referenced an undefined name `url`.
  # The multipart body must be POSTed to the BigQuery media-upload
  # jobs endpoint for this project.
  url = ('https://www.googleapis.com/upload/bigquery/v2/projects/'
         + projectId + '/jobs')

  # Read the table schema (a JSON array of field definitions).
  with open("schema.json", 'r') as schema_file:
    schema = schema_file.read()

  # Create the body of the request, separated by a boundary of xxx.
  # Part 1: the load-job configuration as JSON.
  newresource = ('--xxx\n' +
            'Content-Type: application/json; charset=UTF-8\n' + '\n' +
            '{\n' +
            '   "configuration": {\n' +
            '     "load": {\n' +
            '       "schema": {\n'
            '         "fields": ' + schema + '\n' +
            '      },\n' +
            '      "destinationTable": {\n' +
            '        "projectId": "' + projectId + '",\n' +
            '        "datasetId": "' + datasetId + '",\n' +
            '        "tableId": "' + tableId + '"\n' +
            '      }\n' +
            '    }\n' +
            '  }\n' +
            '}\n' +
            '--xxx\n' +
            'Content-Type: application/octet-stream\n' +
            '\n')

  # Part 2: append the raw CSV data from the specified file.
  with open("chr.csv", 'r') as data_file:
    newresource += data_file.read()

  # Signify the end of the body.
  newresource += '--xxx--\n'

  headers = {'Content-Type': 'multipart/related; boundary=xxx'}
  resp, content = http.request(url, method="POST", body=newresource,
                               headers=headers)
  print(resp.status)
  print(resp)

  if resp.status != 200:
    # BUG FIX: the original silently fell off the end on failure;
    # surface the response body so the error is visible.
    print('Upload failed:')
    print(content)
    return

  jsonResponse = json.loads(content)
  jobReference = jsonResponse['jobReference']['jobId']
  # Poll the job until BigQuery reports the terminal DONE state.
  while True:
    getJob = service.jobs().get(projectId=projectId,
                                jobId=jobReference).execute()
    currentStatus = getJob['status']['state']
    if currentStatus == 'DONE':
      print("Done Loading!")
      return
    print('Waiting to load...')
    print('Current status: ' + currentStatus)
    print(time.ctime())
    time.sleep(10)

# Script entry point: pass the full argv; flag parsing skips argv[0].
if __name__ == '__main__':
  main(sys.argv)

Marilu

unread,
Jul 3, 2014, 11:21:29 AM7/3/14
to gce-dis...@googlegroups.com
Hi Vikram,

The error seems to be related to a mismatch between the content type you're sending and the one being received:
headers = {'Content-Type': 'multipart/related... and 'content-type': 'text/html; charset=UTF-8'

Marilu

Marilu

unread,
Jul 7, 2014, 2:20:07 PM7/7/14
to gce-dis...@googlegroups.com
Hi Vikram 

I notice that you also have an error status of '400', which seems to be related to your OAuth process.

I have run the script using OAuth2WebServerFlow with flow.step1 and flow.step2; it worked for me — try it:

import sys
import json
import urllib
import httplib2

from oauth2client.client import OAuth2WebServerFlow
from apiclient.discovery import build
from oauth2client.file import Storage
from oauth2client.tools import run
from oauth2client.client import AccessTokenRefreshError
from apiclient.errors import HttpError


# Constants used to build the OAuth flow and the BigQuery upload URL.
SERVICE_ACCOUNT = 'default'
GOOGLE_STORAGE_PROJECT_NUMBER = 'your-project-ID'
# BUG FIX: CLIENT_ID and OAUTH_SCOPE were referenced in oauht_acces() but
# never defined, causing a NameError. Fill in CLIENT_ID from the Cloud
# Console; the BigQuery scope is required for load jobs.
CLIENT_ID = 'your-client-id'
CLIENT_SECRET = 'your-client-secret'
OAUTH_SCOPE = 'https://www.googleapis.com/auth/bigquery'
# Out-of-band redirect: Google displays the verification code in the
# browser for the user to paste back. (The original assigned this twice.)
REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob'
API_VERSION = "v1"
PROJECT_ID = "your-project-id"
# Media-upload endpoint for BigQuery load jobs in PROJECT_ID.
URL_PREFIX = ('https://www.googleapis.com/upload/bigquery/v2/projects/'
              + PROJECT_ID + '/jobs')


# Obtain API authorization using OAuth 2.0
def oauht_acces():
  """Return an authorized httplib2.Http via the OAuth 2.0 web-server flow.

  Reuses credentials cached in the 'big_query' Storage file when they are
  still valid; otherwise walks the user through the manual copy/paste
  authorization flow and caches the newly issued credentials.
  """
  # BUG FIX: the original used `storage` before defining it, and its
  # invalid-credentials branch referenced undefined names (tools, FLOW,
  # flags). Check the credential cache first; only run the interactive
  # flow when the cache is missing or invalid.
  storage = Storage('big_query')
  credentials = storage.get()

  if credentials is None or credentials.invalid:
    flow = OAuth2WebServerFlow(CLIENT_ID, CLIENT_SECRET, OAUTH_SCOPE,
                               REDIRECT_URI)
    authorized_url = flow.step1_get_authorize_url()
    print(' ')
    print('Go to the following link in your browser: ' + authorized_url)
    print(' ')
    code = raw_input('Enter verification code : ').strip()
    credentials = flow.step2_exchange(code)
    # Persist the fresh credentials so the next run skips the prompt.
    storage.put(credentials)

  auth_http = credentials.authorize(httplib2.Http())
  return auth_http


def loadTable(http, service):
  """Upload a local CSV file into a BigQuery table via a multipart load job.

  Args:
    http: an authorized httplib2.Http instance.
    service: a built BigQuery v2 service object (used to poll the job).

  Prompts for a schema file (a JSON array of field definitions) and a data
  file, POSTs both as a single multipart/related request to URL_PREFIX,
  then polls the created load job every 10 seconds until it is DONE.
  """
  import time

  datasetId = 'your-datasetId'
  tableId = 'your-table'

  # Read the table schema (a JSON array of fields) from the named file.
  newSchemaFile = raw_input("What is your schema? ")
  with open(newSchemaFile, 'r') as schema_file:
    schema = schema_file.read()

  # Create the body of the request, separated by a boundary of xxx.
  # Part 1: the load-job configuration as JSON.
  newresource = ('--xxx\n' +
            'Content-Type: application/json; charset=UTF-8\n' + '\n' +
            '{\n' +
            '   "configuration": {\n' +
            '     "load": {\n' +
            '       "schema": {\n'
            '         "fields": ' + schema + '\n' +
            '      },\n' +
            '      "destinationTable": {\n' +
            '        "projectId": "' + PROJECT_ID + '",\n' +
            '        "datasetId": "' + datasetId + '",\n' +
            '        "tableId": "' + tableId + '"\n' +
            '      }\n' +
            '    }\n' +
            '  }\n' +
            '}\n' +
            '--xxx\n' +
            'Content-Type: application/octet-stream\n' +
            '\n')

  # Part 2: append the raw CSV data, then close the multipart body.
  newDataFile = raw_input("What is your data? ")
  with open(newDataFile, 'r') as data_file:
    newresource += data_file.read()
  newresource += '--xxx--\n'

  headers = {'Content-Type': 'multipart/related; boundary=xxx'}
  resp, content = http.request(URL_PREFIX, method="POST", body=newresource,
                               headers=headers)

  if resp.status != 200:
    # BUG FIX: the original returned silently on any non-200 response;
    # report the status and body so failures are visible.
    print('Upload failed with HTTP status ' + str(resp.status))
    print(content)
    return

  jobId = json.loads(content)['jobReference']['jobId']
  # Poll the job until BigQuery reports the terminal DONE state.
  while True:
    getJob = service.jobs().get(projectId=PROJECT_ID, jobId=jobId).execute()
    currentStatus = getJob['status']['state']
    if currentStatus == 'DONE':
      print("Done Loading!")
      return
    print('Waiting to load...')
    print('Current status: ' + currentStatus)
    print(time.ctime())
    time.sleep(10)


def main():
  """Authorize, build the BigQuery service object, and run the CSV load."""
  authorized_http = oauht_acces()
  bigquery_service = build('bigquery', 'v2', http=authorized_http)
  loadTable(authorized_http, bigquery_service)

# Script entry point.
if __name__ == '__main__':
  main()
Reply all
Reply to author
Forward
0 new messages