Hi,
I am writing a doit task generator for the following situation:
1. The source is one or more directories, each potentially containing thousands of files.
2. The first task takes that source, does some data transformation and spits out one or more directories as targets
3. The second task takes the directories from the previous step and compiles them to a single file.
4. The system goes on to use that file as a file_dep for later stages.
If I change any file in the source and run the second task, I want the first to be rerun again. Same for when I run a task that depends on the output of the second task.
The generated code might look something like this:
from pathlib import Path
import doit.tools
def task_source():
    """Generate the raw source directory ``test.dir`` and its files."""
    actions = []
    # BUG FIX: the existence check must test the same path that is created
    # ("test.dir", not "test"); otherwise create_folder was scheduled on
    # every run regardless of whether the target directory already existed.
    if not Path("test.dir").exists():
        actions.append((doit.tools.create_folder, ["test.dir"]))
    actions.append("echo foo > test.dir/foo.txt")  # Imagine thousands of files
    return {
        "actions": actions,
        "targets": ["test.dir"],  # output: directory
    }
def task_first():
    """Transform the files in test.dir into derived files in test2.dir."""
    # Stand-in for a transformation touching potentially thousands of files.
    transform = "cat test.dir/foo.txt > test2.dir/foso.txt"
    return {
        "actions": [(doit.tools.create_folder, ["test2.dir"]), transform],
        "file_dep": ["test.dir"],  # input: directory
        "targets": ["test2.dir"],  # output: directory
    }
def task_second():
    """Compile the contents of test2.dir down to the single file foso.txt."""
    compile_step = "cat test2.dir/foso.txt > foso.txt"
    return {
        "actions": [compile_step],
        "file_dep": ["test2.dir"],  # input: directory
        "targets": ["foso.txt"],    # output: file
    }
Now, doit will not take directories as file_deps, so task_first and task_second will blow up. I could make it so that task_first gets its file_deps filled with pathlib.Path("test.dir").glob("**/*"), but what to do about task_second? Its input doesn't exist yet at this stage.
After reading
https://pydoit.org/dependencies.html, I am unclear about how either calc_dep or uptodate or both would help me here. I experimented with adding an `uptodate` key like this:
import os
from pathlib import Path
from typing import Any, Mapping
import doit.dependency
import doit.task
import doit.tools
def task_source():
    """Generate the raw source directory ``test.dir`` and its files."""
    needs_folder = not Path("test.dir").exists()
    actions = []
    if needs_folder:
        actions.append((doit.tools.create_folder, ["test.dir"]))
    # Imagine thousands of files being generated here.
    actions.append("echo foo > test.dir/foo.txt")
    return {
        "actions": actions,
        "targets": ["test.dir"],  # output: directory
    }
def task_first():
    """Transform test.dir into test2.dir; uptodate() expands the directory dep."""
    # Stand-in for a transformation touching potentially thousands of files.
    transform = "type test.dir\\foo.txt > test2.dir\\foso.txt"
    return {
        "actions": [(doit.tools.create_folder, ["test2.dir"]), transform],
        "file_dep": ["test.dir"],   # input: directory (expanded by uptodate)
        "targets": ["test2.dir"],   # output: directory
        "uptodate": [uptodate],
    }
def task_second():
    """Compile test2.dir down to the single file foso.txt."""
    spec = {"actions": ["type test2.dir\\foso.txt > foso.txt"]}
    spec["file_dep"] = ["test2.dir"]  # input: directory (expanded by uptodate)
    spec["targets"] = ["foso.txt"]    # output: file
    spec["uptodate"] = [uptodate]
    return spec
def uptodate(task: "doit.task.Task", values: Mapping[str, str]) -> bool:
    """Up-to-date check that supports directories as file_deps.

    Any dependency whose name ends in ``.dir`` is expanded into the files it
    currently contains; an MD5 is recorded per file, and the task counts as
    up to date only when every MD5 matches the value saved on the last run.

    NOTE: the ``doit.task.Task`` annotation is quoted so defining this
    function does not require ``doit`` to be imported at annotation time.
    """
    file_deps = [Path(f) for f in task.file_dep]
    # Expand directory dependencies into the concrete files beneath them.
    actual_files = []
    expanded_any = False
    for dep in file_deps:
        if dep.suffix == ".dir":
            expanded_any = True
            # BUG FIX: glob("**/*") also yields subdirectories, and hashing
            # a directory would fail — keep only real files.
            actual_files.extend(p for p in dep.glob("**/*") if p.is_file())
        else:
            actual_files.append(dep)
    if expanded_any:
        task.file_dep.clear()  # file_dep must not contain directories.
    md5s = {}
    for p in actual_files:
        path = os.fspath(p)
        if expanded_any:
            task.file_dep.add(path)  # Fill in the actual files.
        # BUG FIX: the original plain-file branch wrote md5s[path] with
        # `path` unbound (NameError); every dep is now keyed by its own path.
        md5s[path] = doit.dependency.get_file_md5(p)
    # Save current MD5 sums for future runs (they arrive in `values` then).
    task.value_savers.append(lambda: md5s)
    # No stored values means this is the first run, which is always out of
    # date — but the bookkeeping above must still happen so subsequent runs
    # have a baseline to compare against.
    if not values:
        return False
    # Up to date only if every file's MD5 matches the previous run.
    for filepath, new_md5 in md5s.items():
        old_md5 = values.get(filepath)
        if old_md5 is None or new_md5 != old_md5:
            return False
    return True
It seems to work for this case. Is this an acceptable way of dealing with this or am I misusing the uptodate mechanism here?