Error when uploading files to Dataverse dataset with Python script

Philipp Conzett

Dec 19, 2024, 10:13:16 AM
to Dataverse Users Community
One of our depositors is trying to upload some files to his dataset at DataverseNO using a Python script [1], but gets the following error message:

Complete upload response: {"status":"ERROR","message":"Only authenticated users can perform the requested operation"}
Failed to complete multipart upload
Failed to process CD44.zip

Any idea what is going wrong here?

Best,
Philipp


[1] Python file upload script:

import os
import json
import requests
import hashlib

# Configuration
API_TOKEN = "XXXX"
SERVER_URL = "https://dataverse.no"
PERSISTENT_ID = "doi:10.18710/DIGQGQ"
FILES_PATH = r"J:\Downloads\transfer_369660_files_a6d0b3f0"

def get_upload_urls(file_size):
    """Step 1: Request upload URLs from Dataverse"""
    headers = {"X-Dataverse-key": API_TOKEN}
    url = f"{SERVER_URL}/api/datasets/:persistentId/uploadurls"
    params = {
        "persistentId": PERSISTENT_ID,
        "size": file_size
    }
    response = requests.get(url, headers=headers, params=params)
    return response.json()

def upload_part(url, part_data):
    """Upload a single part to S3"""
    headers = {"x-amz-tagging": "dv-state=temp"}
    response = requests.put(url, headers=headers, data=part_data)
    if response.status_code == 200:
        return response.headers.get('ETag')
    return None

def complete_multipart(complete_url, etags):
    """Complete the multipart upload"""
    response = requests.put(complete_url, json=etags)
    return response.status_code == 200

def register_file(filename, storage_id):
    """Register the uploaded file in the dataset"""
    headers = {"X-Dataverse-key": API_TOKEN}
    url = f"{SERVER_URL}/api/datasets/:persistentId/add"
    params = {"persistentId": PERSISTENT_ID}

    json_data = {
        "description": f"Upload of {filename}",
        "fileName": filename,
        "mimeType": "application/zip",
        "storageIdentifier": storage_id
    }

    files = {"jsonData": (None, json.dumps(json_data))}
    response = requests.post(url, headers=headers, params=params, files=files)
    return response.json()

def process_file(filename):
    file_path = os.path.join(FILES_PATH, filename)
    file_size = os.path.getsize(file_path)
    print(f"\nProcessing {filename} ({file_size:,} bytes)")

    # Step 1: Get upload URLs
    print("Requesting upload URLs...")
    upload_response = get_upload_urls(file_size)

    if "data" not in upload_response:
        print(f"Error getting upload URLs: {upload_response}")
        return False

    upload_data = upload_response["data"]
    part_size = upload_data.get("partSize", file_size)

    # Handle multipart or single upload
    if "urls" in upload_data:  # Multipart upload
        print(f"Starting multipart upload with {len(upload_data['urls'])} parts")
        etags = {}

        with open(file_path, 'rb') as f:
            for part_num in upload_data["urls"].keys():
                print(f"Uploading part {part_num}...")
                part_data = f.read(part_size)
                if not part_data:
                    break

                etag = upload_part(upload_data["urls"][part_num], part_data)
                if etag:
                    etags[part_num] = etag
                    print(f"Part {part_num} uploaded successfully")
                else:
                    print(f"Failed to upload part {part_num}")
                    return False

        # Complete multipart upload
        print("Completing multipart upload...")
        complete_url = f"{SERVER_URL}{upload_data['complete']}"
        if not complete_multipart(complete_url, etags):
            print("Failed to complete multipart upload")
            return False

    else:  # Single upload
        print("Starting single file upload...")
        with open(file_path, 'rb') as f:
            if not upload_part(upload_data["url"], f.read()):
                print("Failed to upload file")
                return False

    # Register file
    print("Registering file in dataset...")
    register_response = register_file(filename, upload_data["storageIdentifier"])
    if register_response.get("status") == "OK":
        print("File registered successfully")
        return True
    else:
        print(f"Failed to register file: {register_response}")
        return False

def main():
    zip_files = [f for f in os.listdir(FILES_PATH) if f.endswith('.zip')]
    print(f"Found {len(zip_files)} zip files to process")

    for filename in zip_files:
        try:
            if process_file(filename):
                print(f"Successfully processed {filename}")
            else:
                print(f"Failed to process {filename}")
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

if __name__ == "__main__":
    print("Starting upload process...")
    main()
    print("\nUpload process completed")

James Myers

Dec 19, 2024, 10:20:48 AM
to dataverse...@googlegroups.com

My first guess would be that you've hit the timeout for the signed URLs on that store: dataverse.files.<id>.url-expiration-minutes. It defaults to 60 minutes, so if the upload doesn't complete within that time, calls to upload more parts or to finish the multipart upload will fail. (Note that your store may not have that JVM option defined, in which case the 60-minute default applies. It can be set to multiple hours if you intend to support uploads that are very large and/or transferred over slow networks.)
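
As a rough back-of-the-envelope check (the expiration and throughput figures below are only assumptions to illustrate the scale, not values from your installation):

def fits_in_window(size_bytes, expiration_minutes=60, mbit_per_s=100):
    """Rough estimate: can a file of size_bytes finish uploading before its signed URLs expire?"""
    upload_seconds = (size_bytes * 8) / (mbit_per_s * 1_000_000)
    return upload_seconds < expiration_minutes * 60

# At a sustained 100 Mbit/s, a 60-minute window covers roughly 45 GB:
# fits_in_window(40 * 10**9)   -> True
# fits_in_window(200 * 10**9)  -> False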

 

-- Jim


Philipp Conzett

Dec 20, 2024, 2:17:02 AM
to Dataverse Users Community
Thanks, Jim. That makes sense. I see we've set the timeout to 120 minutes, but it seems the depositor tried to upload files totalling more than 200 GB.

Philipp

KL

Dec 26, 2024, 5:08:43 PM
to Dataverse Users Community
You can try adding a timeout=None parameter to both requests.put calls.
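
For example, the two requests.put calls in the script above would then look roughly like this (a sketch; timeout=None tells requests to wait indefinitely for the server rather than applying a client-side limit):

import requests

def upload_part(url, part_data):
    """Upload a single part to S3, with no client-side timeout."""
    headers = {"x-amz-tagging": "dv-state=temp"}
    response = requests.put(url, headers=headers, data=part_data, timeout=None)
    if response.status_code == 200:
        return response.headers.get('ETag')
    return None

def complete_multipart(complete_url, etags):
    """Complete the multipart upload, with no client-side timeout."""
    response = requests.put(complete_url, json=etags, timeout=None)
    return response.status_code == 200

Note that this only affects how long the client is willing to wait for a response; it does not extend the lifetime of the presigned URLs discussed above.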