Generate a million files from a template

137 views
Skip to first unread message

Abhinav Gogna

unread,
Aug 20, 2016, 1:51:11 PM8/20/16
to Clojure
Hello,

I am trying to generate a lot of files using futures, but it hasn't sped up the process the way I had hoped. Here is the code I am using. Can someone point out what I am doing wrong?

(ns jsonworker.core
 
(:require [cheshire.core :refer :all ]))


(defn parse-json
  "Parses the JSON document at `file-loc` and returns the resulting Clojure
  data structure.

  file-loc - anything clojure.java.io/reader accepts (here: a path string).

  The reader is opened inside with-open so it is closed as soon as parsing is
  done, instead of being left open."
  [file-loc]
  (with-open [rdr (clojure.java.io/reader file-loc)]
    (let [parsed (parse-stream rdr)]
      ;; parse-stream returns a lazy seq for a top-level JSON array; realize it
      ;; while the reader is still open. Any other value is already complete.
      (if (seq? parsed)
        (doall parsed)
        parsed))))

;; Parsed JSON template, read once when this form is evaluated and wrapped in
;; an atom so that run-me can replace its value with swap!.
(def json-template (atom (parse-json "resources/individual_1918203.json")))

;; Directory the generated files are written into.
(def fol-location "/Users/json/json_output")

(defn update-individual
  "Returns `json-string` (the parsed JSON map, despite the name) with the value
  under someInfo -> moreInfo replaced by a random integer from 1000 (inclusive)
  to 1000000 (exclusive).

  The number is computed directly with rand-int instead of calling rand-nth on
  (range 1000 1000000); the set of possible values is the same."
  [json-string]
  (assoc-in json-string ["someInfo" "moreInfo"]
            ;; 999000 = 1000000 - 1000, i.e. the size of the original range.
            (+ 1000 (rand-int 999000))))


(defn gen-files
  "Writes `text` to a file called newfile_<n> inside `folder-loc`, where <n> is
  a random integer below 1e7.

  The file is written with spit, so an existing file with the same name is
  overwritten. Returns whatever spit returns."
  [folder-loc text]
  (let [suffix (rand-int 1e7)
        target (str folder-loc "/newfile_" suffix)]
    (spit target text)))


;; Starts ONE future whose body writes `x` files, one after another: on each
;; iteration the shared json-template atom is updated with update-individual
;; and its then-current value is handed to gen-files.
;; Because the whole dotimes loop sits inside a single future, the writes are
;; performed sequentially by that one future rather than in parallel.
;; Returns the future.
(defn run-me [x]
 
(future  
   
(dotimes [_ x]
     
(swap! json-template update-individual)
     
(gen-files fol-location @json-template))))


Thanks!

Erik Assum

unread,
Aug 20, 2016, 2:46:54 PM8/20/16
to clo...@googlegroups.com
I think you should move the `future` inside the `dotimes`, so that you get one future per write.
Also, I don’t think you need to put your json-template in an atom:

(ns jsonworker.foo
(:require [cheshire.core :refer :all ]))


(defn parse-json
  "Parses the JSON document at `file-loc` and returns the resulting Clojure
  data structure.

  file-loc - anything clojure.java.io/reader accepts (here: a path string).

  The reader is opened inside with-open so it is closed as soon as parsing is
  done, instead of being left open."
  [file-loc]
  (with-open [rdr (clojure.java.io/reader file-loc)]
    (let [parsed (parse-stream rdr)]
      ;; parse-stream returns a lazy seq for a top-level JSON array; realize it
      ;; while the reader is still open. Any other value is already complete.
      (if (seq? parsed)
        (doall parsed)
        parsed))))

;; Parsed JSON template, read once when this form is evaluated. It is a plain
;; immutable value here (no atom); callers derive updated copies from it.
(def json-template (parse-json "resources/foo.json"))

;; Directory the generated files are written into.
(def fol-location "/tmp/json_output")

(defn update-individual
  "Returns `json-string` (the parsed JSON map, despite the name) with the value
  under someInfo -> moreInfo replaced by a random integer from 1000 (inclusive)
  to 1000000 (exclusive).

  The number is computed directly with rand-int instead of calling rand-nth on
  (range 1000 1000000); the set of possible values is the same."
  [json-string]
  (assoc-in json-string ["someInfo" "moreInfo"]
            ;; 999000 = 1000000 - 1000, i.e. the size of the original range.
            (+ 1000 (rand-int 999000))))

(defn gen-files
  "Writes `text` to a file called newfile_<n> inside `folder-loc`, where <n> is
  a random integer below 1e7.

  The file is written with spit, so an existing file with the same name is
  overwritten. Returns whatever spit returns."
  [folder-loc text]
  (let [suffix (rand-int 1e7)
        target (str folder-loc "/newfile_" suffix)]
    (spit target text)))

(defn run-me
  "Submits `x` independent futures. Each one derives a randomized copy of
  json-template via update-individual and passes it to gen-files, which writes
  it under fol-location.

  The futures are neither kept nor dereferenced, so this returns nil without
  waiting for the writes to complete."
  [x]
  (let [write-one (fn []
                    (gen-files fol-location (update-individual json-template)))]
    (dotimes [_ x]
      (future-call write-one))))
> --
> You received this message because you are subscribed to the Google
> Groups "Clojure" group.
> To post to this group, send email to clo...@googlegroups.com
> Note that posts from new members are moderated - please be patient with your first post.
> To unsubscribe from this group, send email to
> clojure+u...@googlegroups.com
> For more options, visit this group at
> http://groups.google.com/group/clojure?hl=en
> ---
> You received this message because you are subscribed to the Google Groups "Clojure" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to clojure+u...@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.

hitesh

unread,
Aug 22, 2016, 3:10:05 PM8/22/16
to Clojure
This looks like it's doing too much work to simply generate a random integer.  Are you sure you want to build a lazy list of 999,000 integers and randomly select into it for every invocation?  The garbage collector will be working overtime.

;; Quoted from the original post: the replacement value is picked with rand-nth
;; over (range 1000 1000000).
(defn update-individual [json-string]
  
(assoc-in json-string ["someInfo" "moreInfo"]
            
(rand-nth  (range 1000 1000000))))

This should get you a random number in the range without as much effort.

(+ 1000 (rand-int 999000))

Abhinav Gogna

unread,
Aug 22, 2016, 10:03:05 PM8/22/16
to Clojure
Thanks Guys! Hitesh you were right about rand-nth. I switched to rand-int, which is much faster.

I found a way. It may not be the most optimal way but I can get about 50000 files in 2-3 secs.

I am using cheshire to parse json and pjson to write it back.

(ns jsonworker.core
 
(:require [cheshire.core :refer [parse-stream] ]
           
[pjson.core :refer [read-str write-str get-charset]]))



(defn parse-json
  "Parses the JSON document at `file-loc` and returns the resulting Clojure
  data structure.

  file-loc - anything clojure.java.io/reader accepts (here: a path string).

  The reader is opened inside with-open so it is closed as soon as parsing is
  done, instead of being left open."
  [file-loc]
  (with-open [rdr (clojure.java.io/reader file-loc)]
    (let [parsed (parse-stream rdr)]
      ;; parse-stream returns a lazy seq for a top-level JSON array; realize it
      ;; while the reader is still open. Any other value is already complete.
      (if (seq? parsed)
        (doall parsed)
        parsed))))

;; Parsed JSON template, read once when this form is evaluated; run-me derives
;; an updated copy from it for every file.
(def json-template (parse-json "resources/test_1918203.json"))

;; Directory the generated files are written into.
(def fol-location "/Users/json/json_output")

(defn update-ref
  "Returns `json-string` (the parsed JSON map) with `id` stored under
  basicInformation -> myId. Intermediate maps are created if they are missing;
  all other entries are left untouched."
  [json-string id]
  (let [id-path ["basicInformation" "myId"]]
    (assoc-in json-string id-path id)))


(defn gen-files
  "Serializes `text` with pjson's write-str and writes the result to
  <folder-loc>/test_<id>.json.

  The file is written with spit, so an existing file with the same `id` is
  overwritten. Returns whatever spit returns."
  [folder-loc text id]
  (let [target  (str folder-loc "/test_"  id ".json")
        payload (write-str text)]
    (spit target payload)))



(defn run-me
  "Writes `x` files sequentially on the calling thread. For each file a random
  id below 1e9 is drawn, stored in a copy of json-template via update-ref, and
  the copy is written under fol-location by gen-files using that same id in
  the file name. Returns nil."
  [x]
  (dotimes [_ x]
    (let [iid     (rand-int 1e9)
          updated (update-ref json-template iid)]
      (gen-files fol-location updated iid))))


(defn run-in-parallel
  "Starts 500 futures, each of which calls (run-me y) and so writes `y` files.
  Eg: (run-in-parallel 100) will generate 500*100 = 50000 files.

  The futures are not retained or dereferenced, so this returns nil without
  waiting for the writes to finish."
  [y]
  (dotimes [_ 500]
    ;; Run the work directly in the future. Wrapping it as
    ;; (.start (Thread. (run-me y))) would evaluate (run-me y) right here
    ;; anyway and then hand its nil result to the Thread constructor.
    (future (run-me y))))

Moritz Ulrich

unread,
Aug 23, 2016, 7:48:04 AM8/23/16
to Abhinav Gogna, Clojure

Abhinav Gogna <abhi...@gmail.com> writes:
> (defn run-in-parallel
> "run-in-parallel runs 500 different threads.
> you can give each thread number of files you want to generate
> Eg: run-in-parallel 100 will generate 500*100 = 50000 files"
> [y]
> (dotimes [_ 500]
> (future
> (.start (Thread. (run-me y))))))

`future` already spawns a thread (or at least dispatches into a thread
pool) so you're actually spawning two threads here. You can either get
rid of the `future` or just replace it with `(future (run-me y))`.
Reply all
Reply to author
Forward
0 new messages