runners:
  emr:
    aws_access_key_id: xxxxx
    aws_secret_access_key: xxxxx
    aws_region: eu-west-1
    ec2_key_pair: EMR
    ec2_key_pair_file: /Users/antoinerigoureau/Documents/emr.pem
    ssh_tunnel: true
    ec2_instance_type: m3.xlarge
    ec2_master_instance_type: m3.xlarge
    num_ec2_instances: 1
    cmdenv:
      TZ: Europe/Paris
    bootstrap_python: false
    bootstrap:
    - curl -s https://s3-eu-west-1.amazonaws.com/data-essence/utils/boostrap.sh | sudo bash -s
    - sudo pip install -r req.txt#
    upload_archives:
    - /Users/antoinerigoureau/Documents/Essence/data/geoData/urba_france.zip#data
    upload_files:
    - /Users/antoinerigoureau/Documents/Essence/Source/venv_parallel/normalize.py
    - /Users/antoinerigoureau/Documents/Essence/Source/venv_parallel/compute_features.py
    python_bin: /usr/local/bin/python3.5
    enable_emr_debugging: True
    setup:
    - source /usr/local/ripple/venv/bin/activate
  local:
    upload_archives:
    - /Users/antoinerigoureau/Documents/Essence/data/geoData/urba_france.zip#data
#!/bin/bash
# EMR bootstrap script: installs build dependencies, builds Python 3.5.1 from
# source, sets up the virtualenv that the mrjob `setup:` step activates, and
# builds GDAL 1.9.1 with -fPIC.
set -e
set -x

yum update -y

# install yum packages needed to build Python and GDAL
yum install -y gcc \
    geos-devel \
    gcc-c++ \
    atlas-sse3-devel \
    lapack-devel \
    libpng-devel \
    freetype-devel \
    zlib-devel \
    ncurses-devel \
    readline-devel \
    patch \
    make \
    libtool \
    curl \
    openssl-devel \
    screen

pushd $HOME

# install python
rm -rf Python-3.5.1.tgz
# NOTE(review): the download step appears to have been lost in this paste —
# the tarball must be fetched between the rm and the tar, e.g.:
#   wget https://www.python.org/ftp/python/3.5.1/Python-3.5.1.tgz
tar -xzvf Python-3.5.1.tgz
pushd Python-3.5.1
./configure
make -j 4
make install
popd

# put /usr/local/bin first on PATH for this shell and for future login shells
# (original had a stray trailing ':' after \$PATH in the profile line — removed)
export PATH=/usr/local/bin:$PATH
echo export PATH=/usr/local/bin:\$PATH > /etc/profile.d/usr_local_path.sh
chmod +x /etc/profile.d/usr_local_path.sh

pip3.5 install --upgrade pip virtualenv

# create the virtualenv that the mrjob config activates via its `setup:` step
mkdir -p /usr/local/ripple/venv
virtualenv /usr/local/ripple/venv
source /usr/local/ripple/venv/bin/activate

# install gdal
rm -rf gdal191.zip
# NOTE(review): download step missing here as well — gdal191.zip must be
# fetched before unzip (e.g. from download.osgeo.org/gdal/) — confirm source.
unzip gdal191.zip

## Here is the trick I had to add to get around the following -fPIC error:
# /usr/bin/ld: /root/gdal-1.9.1/frmts/o/.libs/aaigriddataset.o: relocation
# R_X86_64_32S against `vtable for AAIGRasterBand' can not be used when making
# a shared object; recompile with -fPIC
pushd gdal-1.9.1
./configure
CC="gcc -fPIC" CXX="g++ -fPIC" make -j4
make install
popd

# make the freshly built libraries resolvable now and at next login
export LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}
echo export LD_LIBRARY_PATH=/usr/local/lib:\$LD_LIBRARY_PATH > /etc/profile.d/gdal_library_path.sh
chmod +x /etc/profile.d/gdal_library_path.sh
Probable cause of failure:
R/W/S=1749/0/0 in:NA [rec/s] out:NA [rec/s]
minRecWrittenToEnableSkip_=9223372036854775807 HOST=null
USER=hadoop
HADOOP_USER=null
last tool output: |null|
java.io.IOException: Broken pipe
at java.io.FileOutputStream.writeBytes(Native Method)
at java.io.FileOutputStream.write(FileOutputStream.java:345)
at java.io.BufferedOutputStream.write(BufferedOutputStream.java:122)
at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
at java.io.DataOutputStream.write(DataOutputStream.java:107)
at org.apache.hadoop.streaming.io.TextInputWriter.writeUTF8(TextInputWriter.java:72)
at org.apache.hadoop.streaming.io.TextInputWriter.writeValue(TextInputWriter.java:51)
at org.apache.hadoop.streaming.PipeMapper.map(PipeMapper.java:106)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:65)
at org.apache.hadoop.streaming.PipeMapRunner.run(PipeMapRunner.java:34)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:432)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:175)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1548)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:170)
(from lines 48-72 of s3://mrjob-a2eecd466d5ad7c2/tmp/logs/j-3ORB23VXC7XKL/task-attempts/application_1464009285050_0001/container_1464009285050_0001_01_000006/syslog.gz)
caused by:
+ /usr/local/bin/python3.5 test_mrjob.py --step-num=0 --mapper
Traceback (most recent call last):
File "test_mrjob.py", line 2, in <module>
import numpy as np
ImportError: No module named 'numpy'