This was a major leap for us at LinkedIn -- how to pull classes into Cfengine's execution in a safe and reliable way.
Ultimately, we ended up using a technology that Yahoo! open-sourced, called range:
https://github.com/ytoolshed/range
During failsafe.cf, our clients execute a Python script which scrapes range for everything it knows about the node. We dump this in JSON format into /etc/range_classes.conf.
A second, very basic bash script is then executed via usemodule; it reads that file and raises these classes.
Here's a sample of the Python code we execute:
# cat /var/cfengine/modules/module_probe_range_classes
#!/usr/bin/python2.6
"""Cfengine module: probe the range servers and persist the node's range classes.

Query the range servers and set global classes within Cfengine based upon
their output. When complete, dump data into a JSON file
(/etc/range_classes.conf). If for whatever reason we can't query the range
servers, that file is left untouched so the previously dumped classes keep
being used. This allows for "persistent classes".

Lines printed as "+name" / "=name=value" follow the Cfengine module protocol.
"""
import sys

# Do not generate the .pyc files, otherwise, failsafe.cf purges them on each run.
sys.dont_write_bytecode = True

# Use range.py under /var/cfengine/modules. Linux has seco.range installed in
# /usr/lib/python2.6/site-packages/seco. By making /var/cfengine/modules the
# first object in sys.path, we force use of our Cfengine maintained range.py.
# This way, if a RPM update screws seco.range under
# /usr/lib/python2.6/site-packages, Cfengine still functions.
sys.path.insert(0, '/var/cfengine/modules/python2.6/site-packages')

import os
import signal
import json
import time
import platform
import subprocess
import re
from optparse import OptionParser

import seco.range

# Characters that are replaced by an underscore when a range key is turned
# into a Cfengine class name.
_UNSAFE_CLASS_CHARS = re.compile(r'[-+%.()=*,:\/"]')
# A usable class name consists of these characters only, start to end.
_VALID_CLASS_NAME = re.compile(r'[a-zA-Z0-9_]+\Z')
# Range cluster name of the glu agent, e.g. "<fabric>.agent.1".
_GLU_AGENT_CLUSTER = re.compile(r'\w+\.agent\.1$')


class timeout_exception(Exception):
    """Raised by the SIGALRM handler when a range server takes too long."""
    pass


####################################################################################################################
def _debug(message):
    """Print message only when the script runs with --verbose."""
    if options.verbose:
        print(message)


def _collect_range_data(range_object, range_clusters):
    """Expand every cluster/key of this node into a {class source: range query} dict.

    May raise seco.range.RangeException (or timeout_exception via SIGALRM);
    both are handled by the caller.
    """
    range_data = {}
    for cluster in range_clusters:
        _debug("(+) range cluster is " + cluster)
        for KEYS in range_object.expand('%{' + cluster + '}:KEYS'):
            for KEY in KEYS.splitlines():
                if "CLUSTER" == KEY:
                    range_data[cluster] = cluster
                    continue
                _debug("(+) key is " + KEY)
                single_expansion = '%{' + cluster + '}:' + KEY
                for member in range_object.expand(single_expansion):
                    # single range metadata is the object that becomes the cfengine class.
                    single_range_metadata = cluster + ":" + KEY + "_" + member
                    single_range_query = cluster + ":" + KEY
                    # here, we just set the actual range query next to the cfengine class in a dict so
                    # its easy to see from the /etc/range_classes.conf file what range query was
                    # executed to grab this class value
                    range_data[single_range_metadata] = single_range_query
                    _debug("(+) single expansion is " + single_range_metadata)
                    _debug("(+) range query is " + single_range_query)

    # The only way we can determine a machine's fabric reliably is by looking at whatever glu-agent is
    # configured with. This is probably a hack, but, we can only discover range tags on glu deployed
    # services. The below regular expression match is literally looking at the range cluster returned by
    # the glu agent. At some point in time, this will probably have to be replaced with something more
    # reliable -- but for the time being, this should give us range tags on 99% of our infrastructure.
    fabric = ""
    for key in range_data:
        if _GLU_AGENT_CLUSTER.match(key):
            fabric = key.split('.')[0]
    if fabric:
        host_tag_query = fabric + '.host_tags:' + fqdn
        for HOST_TAGS in range_object.expand('%' + host_tag_query):
            _debug("(+) Host tags is " + HOST_TAGS)
            range_data["host_tags_" + fabric + "_" + HOST_TAGS] = host_tag_query
    return range_data


def _publish_range_classes(range_data):
    """Sanitise the keys of range_data into class names and store them in range_classes.

    Returns 1 when every key produced a valid class name, 0 (after printing
    "+invalid_range_data") as soon as one does not.
    """
    for key, value in range_data.items():
        # Make sure our item is now alphanumeric after we substituted underscores for the periods,
        # dashes and colons. The whole name is checked, not just its first character.
        holder = "range_class_" + _UNSAFE_CLASS_CHARS.sub('_', key)
        _debug("(+) holder is " + holder)
        if not _VALID_CLASS_NAME.match(holder):
            print("+invalid_range_data")
            _debug("invalid character string found. holder was " + holder)
            return 0
        range_classes[holder] = value
    return 1


def execute_range_query(current_querying_mps):
    """Query one master policy server (MPS) for the range data of `fqdn`.

    Reads the module-level `options` and `fqdn` and fills the module-level
    `range_classes` dict. All range traffic must finish within 5 seconds.

    Returns 1 when the server answered with usable data (including the case
    where the node simply has no range clusters) and 0 when the server is
    unset, offline, stale, too slow, or returned unusable data, so that the
    caller can try the next server.
    """
    if not current_querying_mps:
        return 0

    range_clusters = []
    range_data = {}

    def timeout_handler(signum, frame):
        raise timeout_exception()

    old_handler = signal.signal(signal.SIGALRM, timeout_handler)
    # set a 5 second alarm
    signal.alarm(5)
    _debug("(+) Starting 5 second signal alarm")

    # Begin timeout exception
    try:
        try:
            # First, grab all clusters. Then on each cluster, grab all keys.
            # Loop through each cluster+key and define a class.
            range_object = seco.range.Range(current_querying_mps)
            update_time = range_object.expand("%version")[0]
            _debug("update_time from " + current_querying_mps + " is " + update_time)
            last_update = int(update_time)
        except seco.range.RangeException as e:
            _debug("mps is not responding to range requests")
            print("+mps_offline")
            print("=mps=" + current_querying_mps)
            return 0
        except (IndexError, ValueError) as e:
            # Empty or non-numeric %version reply: treat this server as
            # unusable instead of dying with a traceback.
            print("+invalid_range_data")
            print("=mps=" + current_querying_mps)
            _debug("unusable %version reply: " + str(e))
            return 0

        current_time = time.time()
        freshness = current_time - last_update
        _debug("current_time is " + str(current_time))
        _debug("freshess is " + str(freshness))

        # 60 seconds * 60 minutes = 3600 seconds
        if freshness > 3600:
            print("+range_data_invalid")
            print("=mps=" + current_querying_mps)
            # print freshness out in minutes instead of seconds
            print("=freshness=" + str(freshness / 60))
            return 0

        try:
            range_clusters = range_object.expand('%index:' + fqdn)
        except seco.range.RangeException as e:
            if "NO_CLUSTER" in str(e) or "NOCLUSTER" in str(e):
                # The server works, the node just is not in any cluster.
                print("+no_range_clusters")
                return 1
            return 0

        if range_clusters:
            try:
                range_data = _collect_range_data(range_object, range_clusters)
            except seco.range.RangeException as e:
                print("+invalid_range_data")
                _debug("range exception was " + str(e))
                return 0
    except timeout_exception:
        print("+range_server_timeout_exception")
        print("=mps=" + current_querying_mps)
        _debug("range timeout")
        return 0
    finally:
        # Cancel the pending alarm before restoring the previous handler so a
        # late SIGALRM can never be delivered to the old handler.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, old_handler)

    return _publish_range_classes(range_data)


####################################################################################################################
if __name__ == '__main__':
    parser = OptionParser(usage="usage: %prog [options]", version="%prog 1.0")
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False,
                      help="Enable verbose execution")
    # "--primiary" and "--forth" are historical misspellings; they are kept so
    # existing callers keep working, with the correct spelling as an alias.
    parser.add_option("-p", "--primiary", "--primary", action="store", dest="mps1",
                      help="Which primary URL to query against?  This is the primary MPS.  REQUIRED")
    parser.add_option("-s", "--secondary", action="store", dest="mps2",
                      help="Which secondary URL to query against?  This is the secondary MPS.  REQUIRED")
    parser.add_option("-t", "--third", action="store", dest="mps3",
                      help="Which third URL to query against?  This is the third MPS.")
    parser.add_option("-f", "--forth", "--fourth", action="store", dest="mps4",
                      help="Which forth URL to query against?  This is the forth MPS.")
    parser.add_option("-n", "--fqdn", action="store", dest="fqdn",
                      help="What FQDN do you want to query for?  This will not write to /etc/range_classes.conf, it will only print to STDOUT.  The jumpstart servers use this option to determine if a client has range classes in use before reimaging the box")
    (options, args) = parser.parse_args()

    # NOTE: only bails out when *both* are missing, although the message
    # suggests both are required; kept as-is for backward compatibility.
    if options.mps1 is None and options.mps2 is None:
        print("Primary and secondary MPS or URLs are required for this script to operate.  Exiting now.")
        sys.exit(1)

    range_classes = {}

    # Normal client execution uses the local node name; otherwise use the
    # name passed on the CLI. Either way make sure linkedin.com is in it.
    node_name = options.fqdn if options.fqdn else platform.node()
    if "linkedin.com" not in node_name:
        fqdn = node_name + ".linkedin.com"
    else:
        fqdn = node_name
    _debug("(+) fqdn is " + fqdn)

    # Attempt to query each MPS in turn for range data, stopping at the first
    # valid response. If we don't get a valid response from any, then bail.
    mps_candidates = (options.mps1, options.mps2, options.mps3, options.mps4)
    if not any(execute_range_query(mps) for mps in mps_candidates):
        print("+invalid_range_data")
        _debug("invalid range data from all master policy servers")
        sys.exit(1)

    if not options.fqdn:
        # This is normal client execution. We should be querying range and dumping the output to
        # /etc/range_classes.conf to set Cfengine classes on.
        _debug("(+) fqdn was not passed to the CLI.  Dumping data to /etc/range_classes.conf")
        try:
            with open("/etc/range_classes.conf", mode="w") as fh:
                json.dump(range_classes, fh, sort_keys=True, indent=3)
            print("+successful_range_query_completed")
        except Exception as e:
            # Top-level boundary: report the failure to Cfengine, but do not
            # swallow SystemExit/KeyboardInterrupt like a bare except would.
            print("+invalid_range_data")
            _debug("could not write /etc/range_classes.conf: " + str(e))
            sys.exit(1)
    else:
        # We've specified a fqdn on the cli, so, print to stdout instead of dumping to
        # /etc/range_classes.conf. The jumpstart servers execute this to determine if a machine has
        # range classes against it or not.
        _debug("(+) fqdn was passed to the CLI.  Printing to stdout instead of dumping to /etc/range_classes.conf")
        for key in sorted(range_classes):
            print(key + " : " + range_classes[key])

    sys.exit(0)
This produces something similar to the following:
# cat /etc/range_classes.conf
{
"range_class_ech3_cdo_1": "ech3.cdo.1",
"range_class_ech3_cdo_1_URL__http___cdops_prod_linkedin_com_": "ech3.cdo.1:URL",
"range_class_inops_mfg_sun": "inops.mfg.sun",
"range_class_inops_model_sun_fire_x4170": "inops.model.sun_fire_x4170",
"range_class_inops_state_production": "inops.state.production",
"range_class_inops_type_device_server": "inops.type.device.server"
}
The LHS is used as Cfengine classes. The RHS is what we used to query range.
In failsafe.cf, we execute this module as follows:
# Evaluated by the agent only while the range_classes_defined class is not set.
# Calls module_probe_range_classes via usemodule, passing the four master policy
# servers (mps1..mps4 from shared_global_environment) as -p/-s/-t/-f.
agent.!range_classes_defined::
"range_classes_defined" expression => usemodule("module_probe_range_classes -p $(shared_global_environment.mps1)
-s $(shared_global_environment.mps2)
-t $(shared_global_environment.mps3)
-f $(shared_global_environment.mps4)","");
At this point, /etc/range_classes.conf is on disk. To make the classes active, we execute this simple bash script.
# Again, never ever disable this module. Cfengine will fall apart if you do.
# Evaluated only while the define_range_classes class is not set; runs the
# bash module that raises the classes stored in /etc/range_classes.conf.
!define_range_classes::
"define_range_classes" expression => usemodule("module_define_range_classes.sh","");

# cat /var/cfengine/modules/module_define_range_classes.sh
#!/bin/bash
PATH=/usr/bin:/usr/sbin:/sbin:/bin

# All of Cfengine depends on this module returning the results from /etc/range_classes.conf. We query against range in failsafe.cf
# in module_probe_range_classes, which does nothing but put /etc/range_classes.conf down on disk. This module actually makes the
# classes active.

# If you disable this module, Linkedin goes offline. Do not touch it.

# For every line mentioning range_class, take the first whitespace-separated
# field (the quoted JSON key followed by a colon), strip all double quotes and
# the first colon, and emit it as "+<class>" for Cfengine.
# A single awk replaces the former grep | awk | sed | sed pipeline, and printing
# straight from awk avoids the unquoted `for ... in \`...\`` expansion, which
# would glob-expand any name containing *, ? or [.
if [ -f /etc/range_classes.conf ]
then
    awk '/range_class/ {
        name = $1
        gsub(/"/, "", name)
        sub(/:/, "", name)
        if (name != "") print "+" name
    }' /etc/range_classes.conf
fi