Hi,
I am writing a doit task generator for the following situation:
1. The source is one or more directories, each potentially containing thousands of files.
2. The first task takes that source, does some data transformation and spits out one or more directories as targets
3. The second task takes the directories from the previous step and compiles them to a single file.
4. The system goes on to use that file as a file_dep for later stages.
If I change any file in the source and run the second task, I want the first to be rerun again. Same for when I run a task that depends on the output of the second task.
The generated code might look something like this:
from pathlib import Path
import doit.tools
def task_source():
    """Generate the raw source directory ``test.dir`` and its files."""
    actions = []
    # BUG FIX: the existence check must test the same path that is created
    # ("test.dir", not "test"); otherwise create_folder was scheduled on
    # every run regardless of whether the target directory already existed.
    if not Path("test.dir").exists():
        actions.append((doit.tools.create_folder, ["test.dir"]))
    actions.append("echo foo > test.dir/foo.txt")  # Imagine thousands of files
    return {
        "actions": actions,
        "targets": ["test.dir"],  # output: directory
    }
def task_first():
    """Transform the files in test.dir into derived files in test2.dir."""
    # Stand-in for a transformation touching potentially thousands of files.
    transform = "cat test.dir/foo.txt > test2.dir/foso.txt"
    return {
        "actions": [(doit.tools.create_folder, ["test2.dir"]), transform],
        "file_dep": ["test.dir"],  # input: directory
        "targets": ["test2.dir"],  # output: directory
    }
def task_second():
    """Compile the contents of test2.dir down to the single file foso.txt."""
    compile_step = "cat test2.dir/foso.txt > foso.txt"
    return {
        "actions": [compile_step],
        "file_dep": ["test2.dir"],  # input: directory
        "targets": ["foso.txt"],    # output: file
    }
Now, doit will not take directories as file_deps, so task_first and task_second will blow up. I could make it so that task_first gets its file_deps filled with pathlib.Path("test.dir").glob("**/*"), but what to do about task_second? Its input doesn't exist yet at this stage.
After reading
https://pydoit.org/dependencies.html, I am unclear about how either calc_dep or uptodate or both would help me here. I experimented with adding an `uptodate` key like this:
import os
from pathlib import Path
from typing import Any, Mapping
import doit.dependency
import doit.task
import doit.tools
def task_source():
    """Generate the raw source directory ``test.dir`` and its files."""
    needs_folder = not Path("test.dir").exists()
    actions = []
    if needs_folder:
        actions.append((doit.tools.create_folder, ["test.dir"]))
    # Imagine thousands of files being generated here.
    actions.append("echo foo > test.dir/foo.txt")
    return {
        "actions": actions,
        "targets": ["test.dir"],  # output: directory
    }
def task_first():
    """Transform test.dir into test2.dir; uptodate() expands the directory dep."""
    # Stand-in for a transformation touching potentially thousands of files.
    transform = "type test.dir\\foo.txt > test2.dir\\foso.txt"
    return {
        "actions": [(doit.tools.create_folder, ["test2.dir"]), transform],
        "file_dep": ["test.dir"],   # input: directory (expanded by uptodate)
        "targets": ["test2.dir"],   # output: directory
        "uptodate": [uptodate],
    }
def task_second():
    """Compile test2.dir down to the single file foso.txt."""
    spec = {"actions": ["type test2.dir\\foso.txt > foso.txt"]}
    spec["file_dep"] = ["test2.dir"]  # input: directory (expanded by uptodate)
    spec["targets"] = ["foso.txt"]    # output: file
    spec["uptodate"] = [uptodate]
    return spec
def uptodate(task: "doit.task.Task", values: Mapping[str, str]) -> bool:
    """Up-to-date check that supports directories as file_deps.

    Any dependency whose name ends in ``.dir`` is expanded into the files it
    currently contains; an MD5 is recorded per file, and the task counts as
    up to date only when every MD5 matches the value saved on the last run.

    NOTE: the ``doit.task.Task`` annotation is quoted so defining this
    function does not require ``doit`` to be imported at annotation time.
    """
    file_deps = [Path(f) for f in task.file_dep]
    # Expand directory dependencies into the concrete files beneath them.
    actual_files = []
    expanded_any = False
    for dep in file_deps:
        if dep.suffix == ".dir":
            expanded_any = True
            # BUG FIX: glob("**/*") also yields subdirectories, and hashing
            # a directory would fail — keep only real files.
            actual_files.extend(p for p in dep.glob("**/*") if p.is_file())
        else:
            actual_files.append(dep)
    if expanded_any:
        task.file_dep.clear()  # file_dep must not contain directories.
    md5s = {}
    for p in actual_files:
        path = os.fspath(p)
        if expanded_any:
            task.file_dep.add(path)  # Fill in the actual files.
        # BUG FIX: the original plain-file branch wrote md5s[path] with
        # `path` unbound (NameError); every dep is now keyed by its own path.
        md5s[path] = doit.dependency.get_file_md5(p)
    # Save current MD5 sums for future runs (they arrive in `values` then).
    task.value_savers.append(lambda: md5s)
    # No stored values means this is the first run, which is always out of
    # date — but the bookkeeping above must still happen so subsequent runs
    # have a baseline to compare against.
    if not values:
        return False
    # Up to date only if every file's MD5 matches the previous run.
    for filepath, new_md5 in md5s.items():
        old_md5 = values.get(filepath)
        if old_md5 is None or new_md5 != old_md5:
            return False
    return True
It seems to work for this case. Is this an acceptable way of dealing with this or am I misusing the uptodate mechanism here?