Speed up recursive composite strategy

9 views

Skip to first unread message

Yclept Nemo

unread,

Sep 3, 2023, 6:52:32 PM (9/3/23)

to Hypothesis users

I have a recursive composite strategy that generates a small amount of data for each example, yet takes about a minute to generate an example. How can I improve the efficiency of this strategy?

>>> import re
>>> import typing
>>> import itertools
>>> import dataclasses
>>>
>>> from test.support.hypothesis_helper import hypothesis
>>>
>>>
@dataclasses.dataclass
class Token:
    """One piece of generated text, tagged with what produced it."""

    # Tag identifying the producer of this token; the strategy code in this
    # file passes the generating function itself.
    category: typing.Any
    # The literal text contributed by this token.
    value: str
    # True when the token separates fields rather than carrying field text.
    delim: bool = False
>>>
>>>
class Entry(list):
    """A list of tokens making up one entry.

    The properties below only rely on each element exposing a ``value``
    string and a ``delim`` flag.
    """

    @classmethod
    def from_empty(cls, length, category, value=""):
        """Build an entry of ``length`` fields separated by delimiter tokens.

        Every token gets the same ``category`` and ``value``.  A delimiter
        token (``delim=True``) is placed before each field except the first,
        so a positive ``length`` yields ``2 * length - 1`` tokens and a
        ``length`` of zero yields an empty entry.
        """
        tokens = []
        for index in range(length):
            if index > 0:
                tokens.append(Token(category, value, True))
            tokens.append(Token(category, value, False))
        return cls(tokens)

    def __repr__(self):
        """Return the list repr prefixed with the concrete class name."""
        return type(self).__name__ + super().__repr__()

    @property
    def as_text(self):
        """Concatenation of every token's ``value``, delimiters included."""
        return "".join(token.value for token in self)

    @property
    def as_data(self):
        """Tuple with one item per run of consecutive non-delimiter tokens.

        Each item is the joined text of the run, or ``None`` when that text
        is empty.  Delimiter runs only separate the items and contribute
        nothing themselves.
        """
        data = []
        for is_delim, group in itertools.groupby(
            self, key=lambda token: token.delim
        ):
            if is_delim:
                continue
            text = "".join(token.value for token in group)
            data.append(text or None)
        return tuple(data)
>>>
>>>
@hypothesis.strategies.composite
def identities_strategy(draw):
    """Draw author/maintainer header text together with the expected data.

    Returns a tuple ``(text, authors, maintainers)``.  ``text`` holds up to
    four lines prefixed ``Author``, ``Maintainer``, ``Author-email`` and
    ``Maintainer-email``.  ``authors`` and ``maintainers`` are sets of the
    ``Entry.as_data`` tuples of the entries written to the matching lines.

    All helpers are closures so they can share ``draw`` and the drawn
    ``random`` instance.
    """
    strategies = hypothesis.strategies
    random = draw(strategies.randoms())
    # "Cs" is the Unicode surrogate category; it is excluded from all text.
    char_opts = {"blacklist_categories": ("Cs",)}

    def merge_char_opts(opts1, opts2):
        """Placeholder: not implemented and not called by this strategy."""
        pass

    def choices_bernoulli(population, weights, failure_weight, min_size=0):
        """Pick weighted items from ``population`` until a failure is drawn.

        A failure drawn while fewer than ``min_size`` items were collected
        is ignored and the draw is repeated.
        """
        # We'd have to sample the negative binomial distribution to
        # determine 'k' from the probability 'failure_weight', so
        # instead we simulate a bernoulli process which stops at the
        # first failure.
        fail = object()
        candidates = (*population, fail)
        candidate_weights = (*weights, failure_weight)
        results = []
        while True:
            choice = random.choices(candidates, candidate_weights, k=1)[0]
            if choice is not fail:
                results.append(choice)
            elif len(results) >= min_size:
                break
        return results

    def choose_bool(true=50, false=50):
        """Return True or False with the given relative weights."""
        return random.choices((True, False), (true, false))[0]

    def insert_random(seq, item):
        """Return ``seq`` with ``item`` spliced in at a random position."""
        index = random.randint(0, len(seq))
        return seq[:index] + item + seq[index:]

    def generate_text(
        *regexes_sub_group1_with_repl,
        text_blacklist_chars="",
        repl_blacklist_chars="",
        min_size=0,
        max_size=None,
    ):
        """Draw text, then patch every match of the given regexes.

        Each regex must define group 1.  A match is replaced by group 1
        followed by one freshly drawn character that is not in
        ``repl_blacklist_chars``.
        """
        text_chars = strategies.characters(
            **char_opts, blacklist_characters=text_blacklist_chars
        )
        repl_chars = strategies.characters(
            **char_opts, blacklist_characters=repl_blacklist_chars
        )

        def repl_fn(match):
            return match.group(1) + draw(repl_chars)

        text = draw(
            strategies.text(
                alphabet=text_chars, min_size=min_size, max_size=max_size
            )
        )
        for regex in regexes_sub_group1_with_repl:
            text = re.sub(regex, repl_fn, text)
        return text

    def entries_combined():
        """Generate the four header lines and collect their expected data."""
        authors = set()
        maintainers = set()
        # (entry generator, set receiving its data, header prefix)
        plan = (
            (name_entry, authors, "Author"),
            (name_entry, maintainers, "Maintainer"),
            (ident_entry, authors, "Author-email"),
            (ident_entry, maintainers, "Maintainer-email"),
        )
        lines = []
        for entry_type, entry_target, entry_prefix in plan:
            count, data_set, data_str = entries(entry_type)
            if not count:
                # No entries drawn for this header: omit the line entirely.
                continue
            lines.append(entry_prefix + ": " + data_str + "\n")
            entry_target.update(data_set)
        return ("".join(lines), authors, maintainers)

    def entries(entry_function):
        """Draw 0-20 entries from ``entry_function`` and summarise them.

        Every entry except the last gets a trailing ``", "`` delimiter
        token.  Returns whatever ``process`` returns for the resulting list.
        """
        count = draw(strategies.integers(min_value=0, max_value=20))
        entry_list = []
        for i in range(count):
            entry = entry_function()
            if i < count - 1:
                entry.append(Token(entries, ", ", True))
            entry_list.append(entry)
        unbalance(entry_list)
        return process(entry_list)

    def process(entries):
        """Return ``(number of entries, set of entry data, joined text)``.

        Entries whose data consists only of ``None`` items are left out of
        the data set but still contribute to the text and the count.
        """
        text = "".join(entry.as_text for entry in entries)
        data = set()
        for entry in entries:
            entry_data = entry.as_data
            if all(elem is None for elem in entry_data):
                continue
            data.add(entry_data)
        return (len(entries), data, text)

    def unbalance(entries):
        """Possibly insert stray quote characters after the last quoted token.

        Scans tokens from the end of the last entry backwards and stops at
        the first token produced by ``quote``.  Tokens seen before that
        which came from one of the ``*_other`` generators are candidates.
        For each of ``"`` and ``'``, with even odds, one candidate is picked
        and the character is inserted at a random position in its text.

        Returns False when there is no candidate, True otherwise.
        """
        type_candidates = (
            ident_addr_domain_other,
            ident_addr_local_other,
            ident_name_other,
            name_value_other,
        )
        index_candidates = []
        quote_seen = False
        for i in reversed(range(len(entries))):
            for j in reversed(range(len(entries[i]))):
                # Compare the token's category, not the token itself: a
                # Token never equals the function that generated it, so
                # comparing the token directly never matches anything.
                category = entries[i][j].category
                if category is quote:
                    quote_seen = True
                    break
                if category in type_candidates:
                    index_candidates.append((i, j))
            if quote_seen:
                break
        if not index_candidates:
            return False
        for qchr in "\"'":
            if choose_bool():
                continue
            idx_entry, idx_part = random.choice(index_candidates)
            part = entries[idx_entry][idx_part]
            # Tokens are dataclass instances, not tuples; build a copy with
            # the modified text so category and delim are preserved.
            entries[idx_entry][idx_part] = dataclasses.replace(
                part, value=insert_random(part.value, qchr)
            )
        return True

    def name_entry():
        """Generate a name entry: empty (weight 10) or with a value (90)."""
        return random.choices((name_empty, name_value), (10, 90))[0]()

    def name_empty():
        """Entry with two empty fields."""
        return Entry.from_empty(2, name_empty)

    def name_value():
        """Entry whose first field is replaced by generated name tokens."""
        functions = choices_bernoulli(
            (quote, name_value_other), (25, 75), 10, min_size=1
        )
        entry = Entry.from_empty(2, name_value)
        entry[:1] = [f() for f in functions]
        return entry

    def ident_entry():
        """Generate an identity entry of a randomly weighted shape."""
        return random.choices(
            (ident_empty, ident_name_only, ident_addr_only, ident_name_addr),
            (10, 20, 20, 50),
        )[0]()

    def ident_empty():
        """Entry with two empty fields."""
        return Entry.from_empty(2, ident_empty)

    def ident_name_only():
        """Entry with a generated name and an empty address field."""
        entry = Entry.from_empty(2, ident_name_only)
        entry[:1] = ident_name()
        return entry

    def ident_addr_only():
        """Entry with an empty name field and a generated address."""
        entry = Entry.from_empty(2, ident_addr_only)
        entry[2:] = ident_addr()
        return entry

    def ident_name_addr():
        """Entry made of a name, a space delimiter and an address."""
        name = ident_name()
        delim = ident_name_addr_delim(name)
        addr = ident_addr()
        return Entry((*name, delim, *addr))

    def ident_name_addr_delim(name):
        """Delimiter token of spaces between a name and an address.

        At least one space is required unless the last name token's text
        already ends with ``@``.
        """
        min_size = 0 if name[-1].value.endswith("@") else 1
        spaces = draw(strategies.text(alphabet=" ", min_size=min_size))
        return Token(ident_name_addr_delim, spaces, True)

    def ident_name():
        """One or more tokens (quoted or plain) forming a name."""
        functions = choices_bernoulli(
            (quote, ident_name_other), (25, 75), 10, min_size=1
        )
        return [f() for f in functions]

    def ident_addr():
        """Tokens for ``local@domain``, optionally wrapped in ``<`` / ``>``."""
        local = ident_addr_local()
        domain = ident_addr_domain()
        result = [*local, Token(ident_addr, "@"), *domain]
        true_false = [20, 80]
        if choose_bool(*true_false):
            result = [Token(ident_addr, "<"), *result]
            # NOTE(review): the pasted source lost its indentation.  This
            # assignment is assumed to belong to the branch above, making a
            # closing bracket more likely once an opening one was added.
            # Confirm against the original file.
            true_false = [50, 50]
        if choose_bool(*true_false):
            result = [*result, Token(ident_addr, ">")]
        return result

    def ident_addr_local():
        """One or more tokens forming the local part of an address."""
        functions = choices_bernoulli(
            (quote, ident_addr_local_other), (25, 75), 10, min_size=1
        )
        return [f() for f in functions]

    def ident_addr_domain():
        """One or more tokens forming the domain part of an address."""
        functions = choices_bernoulli(
            (quote, ident_addr_domain_other), (25, 75), 10, min_size=1
        )
        return [f() for f in functions]

    def quote():
        """Token of drawn text wrapped in a randomly chosen quote character."""
        qchr = draw(strategies.sampled_from("'\""))
        # Matches the chosen quote character together with any run of
        # backslash-escaped pairs directly before it; the substitution
        # below inserts a backslash in front of the quote character.
        escape_qchar = re.compile(rf"(?<!\\)((?:\\.)*)({qchr})")
        text_chars = strategies.characters(**char_opts)
        text = draw(strategies.text(alphabet=text_chars, min_size=0))
        text = escape_qchar.sub(r"\1\\\2", text)
        return Token(quote, qchr + text + qchr)

    def ident_name_other():
        """Unquoted name text of 1-20 drawn characters, without quotes."""
        text = generate_text(
            r"([, ])$",
            r"(,) ",
            text_blacklist_chars="\"'",
            repl_blacklist_chars="\"', @",
            min_size=1,
            max_size=20,
        )
        # Any non-space character directly before "@" is replaced by a space.
        text = re.sub(r"[^ ]@", " @", text)
        return Token(ident_name_other, text)

    def ident_addr_local_other():
        """Unquoted local-part text: no quotes, spaces or ``@``."""
        text = generate_text(
            text_blacklist_chars="\"' @", min_size=1, max_size=20
        )
        return Token(ident_addr_local_other, text)

    def ident_addr_domain_other():
        """Unquoted domain text: no quotes, and no ``", "`` sequence."""
        text = generate_text(
            r"(,) ",
            text_blacklist_chars="\"'",
            repl_blacklist_chars="\"', ",
            min_size=1,
            max_size=20,
        )
        return Token(ident_addr_domain_other, text)

    def name_value_other():
        """Unquoted name-value text: no quotes, and no ``", "`` sequence."""
        text = generate_text(
            r"(,) ",
            text_blacklist_chars="\"'",
            repl_blacklist_chars="\"' ",
            min_size=1,
            max_size=20,
        )
        return Token(name_value_other, text)

    return entries_combined()

Reply all

Reply to author

Forward

0 new messages