Speed up a recursive composite strategy

9 views
Skip to first unread message

Yclept Nemo

unread,
Sep 3, 2023, 6:52:32 PM9/3/23
to Hypothesis users
I have a recursive composite strategy that generates a small amount of data for each example, yet takes about a minute to generate an example. How can I improve the efficiency of this strategy?


>>> import re
>>> import typing
>>> import itertools
>>> import dataclasses
>>>
>>> from test.support.hypothesis_helper import hypothesis
>>>
>>>
>>> @dataclasses.dataclass
>>> class Token:
>>>     category: typing.Any
>>>     value: str
>>>     delim: bool = False
>>>
>>>
>>> class Entry(list):
>>>     @classmethod
>>>     def from_empty(cls, length, category, value=""):
>>>         tokens = []
>>>         for index in range(0, length):
>>>             if index > 0:
>>>                 tokens.append(Token(category, value, True))
>>>             tokens.append(Token(category, value, False))
>>>         return cls(tokens)
>>>
>>>     def __repr__(self):
>>>         return type(self).__name__ + super().__repr__()
>>>
>>>     @property
>>>     def as_text(self):
>>>         return "".join(token.value for token in self)
>>>
>>>     @property
>>>     def as_data(self):
>>>         data = []
>>>         keyfunc = lambda token: token.delim
>>>         for key, group in itertools.groupby(self, key=keyfunc):
>>>             if key:
>>>                 continue
>>>             text = "".join(token.value for token in group)
>>>             data.append(text if text else None)
>>>         return tuple(data)
>>>
>>>
>>> @hypothesis.strategies.composite
>>> def identities_strategy(draw):
>>>     strategies = hypothesis.strategies
>>>     random = draw(strategies.randoms())
>>>     char_opts = {"blacklist_categories": ("Cs",)}
>>>
>>>     def merge_char_opts(opts1, opts2):
>>>         pass
>>>
>>>     def choices_bernoulli(population, weights, failure_weight, min_size=0):
>>>         # We'd have to sample the negative binomial distribution to
>>>         # determine 'k' from the probability 'failure_weight', so
>>>         # instead we simulate a bernoulli process which stops at the
>>>         # first failure.
>>>         fail = object()
>>>         results = []
>>>         while True:
>>>             choice = random.choices\
>>>                 ((*population, fail), (*weights, failure_weight), k=1)[0]
>>>             if choice is fail and len(results) < min_size:
>>>                 continue
>>>             if choice is fail:
>>>                 break
>>>             results.append(choice)
>>>         return results
>>>
>>>     def choose_bool(true=50, false=50):
>>>         return random.choices\
>>>             ((True, False), (true, false))[0]
>>>
>>>     def insert_random(seq, item):
>>>         index = random.randint(0, len(seq))
>>>         return seq[:index] + item + seq[index:]
>>>
>>>     def generate_text\
>>>         ( *regexes_sub_group1_with_repl
>>>         , text_blacklist_chars="", repl_blacklist_chars=""
>>>         , min_size=0, max_size=None
>>>         ):
>>>         text_chars = strategies.characters\
>>>             (**char_opts, blacklist_characters=text_blacklist_chars)
>>>         repl_chars = strategies.characters\
>>>             (**char_opts, blacklist_characters=repl_blacklist_chars)
>>>         repl_fn = lambda m: m.group(1) + draw(repl_chars)
>>>         text = draw(strategies.text
>>>             (alphabet=text_chars, min_size=min_size, max_size=max_size))
>>>         for regex in regexes_sub_group1_with_repl:
>>>             text = re.sub(regex, repl_fn, text)
>>>         return text
>>>
>>>     def entries_combined():
>>>         text = ""
>>>         authors = set()
>>>         maintainers = set()
>>>
>>>         entry_types = (name_entry, ident_entry)
>>>         entry_targets = (authors, maintainers)
>>>         entry_prefixes =\
>>>             ( "Author", "Maintainer"
>>>             , "Author-email", "Maintainer-email"
>>>             )
>>>         combinations = zip\
>>>             ( ((i,j) for i in entry_types for j in entry_targets)
>>>             , entry_prefixes
>>>             )
>>>
>>>         for (entry_type, entry_target), entry_prefix in combinations:
>>>             count, data_set, data_str = entries(entry_type)
>>>             if not count:
>>>                 continue
>>>             text += entry_prefix + ": " + data_str + "\n"
>>>             entry_target |= data_set
>>>
>>>         return (text, authors, maintainers)
>>>
>>>     def entries(entry_function):
>>>         count = draw(strategies.integers(min_value=0, max_value=20))
>>>         entry_list = []
>>>         for i in range(count):
>>>             entry = entry_function()
>>>             if i < count - 1:
>>>                 entry.append(Token(entries, ", ", True))
>>>             entry_list.append(entry)
>>>         unbalance(entry_list)
>>>         return process(entry_list)
>>>
>>>     def process(entries):
>>>         text = ""
>>>         data = set()
>>>
>>>         for entry in entries:
>>>             entry_text = entry.as_text
>>>             entry_data = entry.as_data
>>>             text += entry_text
>>>             if all(elem is None for elem in entry_data):
>>>                 continue
>>>             data.add(entry_data)
>>>
>>>         return (len(entries), data, text)
>>>
>>>     def unbalance(entries):
>>>         index_candidates = []
>>>         type_candidates =\
>>>             [ ident_addr_domain_other
>>>             , ident_addr_local_other
>>>             , ident_name_other
>>>             , name_value_other
>>>             ]
>>>         stop = False
>>>         i = len(entries) - 1
>>>         while i >= 0 and not stop:
>>>             j = len(entries[i]) - 1
>>>             while j >= 0 and not (stop:=entries[i][j] is quote):
>>>                 if entries[i][j] in type_candidates:
>>>                     index_candidates.append((i,j))
>>>                 j -= 1
>>>             i -= 1
>>>         if not index_candidates:
>>>             return False
>>>         for qchr in "\"'":
>>>             if choose_bool():
>>>                 continue
>>>             idx_entry, idx_part = random.choice(index_candidates)
>>>             part = entries[idx_entry][idx_part]
>>>             part = (part[0], insert_random(part[1], qchr))
>>>             entries[idx_entry][idx_part] = part
>>>         return True
>>>
>>>     def name_entry():
>>>         return random.choices\
>>>             ( (name_empty, name_value), (10, 90)
>>>             )[0]()
>>>
>>>     def name_empty():
>>>         return Entry.from_empty(2, name_empty)
>>>
>>>     def name_value():
>>>         functions = choices_bernoulli\
>>>             ( (quote, name_value_other)
>>>             , (25, 75), 10, min_size=1
>>>             )
>>>         entry = Entry.from_empty(2, name_value)
>>>         entry[:1] = [f() for f in functions]
>>>         return entry
>>>
>>>     def ident_entry():
>>>         return random.choices\
>>>             ( ( ident_empty
>>>               , ident_name_only
>>>               , ident_addr_only
>>>               , ident_name_addr
>>>               )
>>>             , (10, 20, 20, 50)
>>>             )[0]()
>>>
>>>     def ident_empty():
>>>         return Entry.from_empty(2, ident_empty)
>>>
>>>     def ident_name_only():
>>>         entry = Entry.from_empty(2, ident_name_only)
>>>         entry[:1] = ident_name()
>>>         return entry
>>>
>>>     def ident_addr_only():
>>>         entry = Entry.from_empty(2, ident_addr_only)
>>>         entry[2:] = ident_addr()
>>>         return entry
>>>
>>>     def ident_name_addr():
>>>         name = ident_name()
>>>         delim = ident_name_addr_delim(name)
>>>         addr = ident_addr()
>>>         return Entry((*name, delim, *addr))
>>>
>>>     def ident_name_addr_delim(name):
>>>         min_size = 0 if name[-1].value.endswith("@") else 1
>>>         delim = draw(strategies.text(alphabet=" ", min_size=min_size))
>>>         delim = Token(ident_name_addr_delim, delim, True)
>>>         return delim
>>>
>>>     def ident_name():
>>>         functions = choices_bernoulli\
>>>             ( (quote, ident_name_other)
>>>             , (25, 75), 10, min_size=1
>>>             )
>>>         return [f() for f in functions]
>>>
>>>     def ident_addr():
>>>         local = ident_addr_local()
>>>         domain = ident_addr_domain()
>>>         result = [*local, Token(ident_addr, "@"), *domain]
>>>         true_false = [20, 80]
>>>         if choose_bool(*true_false):
>>>             result = [Token(ident_addr, "<"), *result]
>>>             true_false = [50, 50]
>>>         if choose_bool(*true_false):
>>>             result = [*result, Token(ident_addr, ">")]
>>>         return result
>>>
>>>     def ident_addr_local():
>>>         functions = choices_bernoulli\
>>>             ( (quote, ident_addr_local_other)
>>>             , (25, 75), 10, min_size=1
>>>             )
>>>         return [f() for f in functions]
>>>
>>>     def ident_addr_domain():
>>>         functions = choices_bernoulli\
>>>             ( (quote, ident_addr_domain_other)
>>>             , (25, 75), 10, min_size=1
>>>             )
>>>         return [f() for f in functions]
>>>
>>>     def quote():
>>>         qchr = draw(strategies.sampled_from("'\""))
>>>         escape_qchar = re.compile(rf"(?<!\\)((?:\\.)*)({qchr})")
>>>         text_chars = strategies.characters(**char_opts)
>>>         text = draw(strategies.text(alphabet=text_chars, min_size=0))
>>>         text = escape_qchar.sub(r"\1\\\2", text)
>>>         text = qchr + text + qchr
>>>         return Token(quote, text)
>>>
>>>     def ident_name_other():
>>>         text = generate_text\
>>>             ( r"([, ])$" , r"(,) "
>>>             , text_blacklist_chars="\"'"
>>>             , repl_blacklist_chars="\"', @"
>>>             , min_size=1, max_size=20
>>>             )
>>>         text = re.sub(r"[^ ]@", " @", text)
>>>         return Token(ident_name_other, text)
>>>
>>>     def ident_addr_local_other():
>>>         text = generate_text\
>>>             (text_blacklist_chars="\"' @", min_size=1, max_size=20)
>>>         return Token(ident_addr_local_other, text)
>>>
>>>     def ident_addr_domain_other():
>>>         text = generate_text\
>>>             ( r"(,) "
>>>             , text_blacklist_chars="\"'"
>>>             , repl_blacklist_chars="\"', "
>>>             , min_size=1, max_size=20
>>>             )
>>>         return Token(ident_addr_domain_other, text)
>>>
>>>     def name_value_other():
>>>         text = generate_text\
>>>             ( r"(,) "
>>>             , text_blacklist_chars="\"'"
>>>             , repl_blacklist_chars="\"' "
>>>             , min_size=1, max_size=20
>>>             )
>>>         return Token(name_value_other, text)
>>>
>>>     return entries_combined()
Reply all
Reply to author
Forward
0 new messages