#!/usr/bin/env python
# {{ ansible_managed }}
# Author: Maurice Makaay, XS4ALL
#
# Collects the Galera status of every cluster node through "lxc exec",
# prints an overview and proposes the commands needed to get the cluster
# running again. The commands are only executed after confirmation.
#
# Exit status:
#   0 - cluster is healthy, or the proposed commands were executed
#   1 - degraded cluster, proposed restarts were declined
#   2 - a node was flagged safe to bootstrap, proposal was declined
#   3 - bootstrap node derived from recovered positions, proposal declined
#   4 - nodes report different cluster UUIDs, manual intervention required

from __future__ import print_function
import subprocess
import re
import json
from os import system
from sys import exit, stdin, stdout

# Both values are filled in by the template engine when this file is rendered.
nodes = {{ groups['galera'] | to_json }}
lxd_status = {{ lxd_status | to_json }}


def propose(commands, exit_code):
    """Print the proposed commands and run them when the operator agrees.

    commands  -- list of shell command strings, shown and executed in order
    exit_code -- process exit status used when the operator declines

    This function never returns: it exits with status 0 after executing
    the commands, or with exit_code when the proposal is declined.
    """
    for command in commands:
        print("# %s" % command)
    print("")
    print("Execute now? [y/n]: ", end="")
    # The prompt has no trailing newline, so flush it explicitly to make
    # sure it is visible before blocking on stdin.
    stdout.flush()
    answer = stdin.readline()
    # Require an explicit "y" or "yes". A substring match would also accept
    # answers such as "nay" or "not yet".
    if answer.strip().lower() in ("y", "yes"):
        print("")
        for command in commands:
            print("EXECUTING> %s" % command)
            # Flush our own output first, so it is not printed after the
            # output of the child process when stdout is buffered.
            stdout.flush()
            system(command)
            print("")
        exit(0)
    exit(exit_code)


print("Collecting Galera status information from nodes ...")
status = {}
for node in nodes:
    lxd_host = "%s:%s" % (lxd_status[node]["host"], node)
    try:
        result = subprocess.check_output([
            "lxc", "exec", lxd_host, "/root/galera_cluster_status"])
        status[node] = json.loads(result)
    except (subprocess.CalledProcessError, ValueError):
        # The status command failed or returned output that is not valid
        # JSON. Use placeholder values that fail all of the checks below.
        status[node] = {
            "cluster_size": 0,
            "cluster_status": 'Status Failed',
            "connected": "Unknown",
            "ready": "Unknown",
            "safe_to_bootstrap": 0,
            "seqno": -1,
            "uuid": None
        }
    status[node]['lxd_host'] = lxd_host
    status[node]['node'] = node


def is_primary(s):
    """Return True when the node reports cluster status "Primary"."""
    return s["cluster_status"] == "Primary"


def has_correct_cluster_size(s):
    """Return True when the node sees as many members as there are nodes."""
    return s["cluster_size"] == len(nodes)


def is_connected(s):
    """Return True when the node reports connected "ON"."""
    return s["connected"] == "ON"


def is_ready(s):
    """Return True when the node reports ready "ON"."""
    return s["ready"] == "ON"


def start_commands_for_other_nodes(bootstrap_node):
    """Return the mysql start commands for all nodes but bootstrap_node."""
    return [
        "lxc exec %s service mysql start" % s["lxd_host"]
        for n, s in status.items()
        if n != bootstrap_node["node"]
    ]


print("")
print("%-20s %-15s %-6s %-12s %-7s" % (
    "Node", "Status", "Size", "Connected", "Ready"))
for node in nodes:
    s = status[node]
    print("%-20s %-15s %-6s %-12s %-7s" % (
        node, s["cluster_status"], s["cluster_size"],
        s["connected"], s["ready"]))

print("")
print("Checking cluster status ...")
print("")

# ----------------------------------------------------------------------------
# CASE: All cluster nodes are up and running, green lights galore!
# ----------------------------------------------------------------------------

all_primary = all(map(is_primary, status.values()))
all_size_ok = all(map(has_correct_cluster_size, status.values()))
all_connected = all(map(is_connected, status.values()))
all_ready = all(map(is_ready, status.values()))

if all([all_primary, all_size_ok, all_connected, all_ready]):
    print("There's no bootstrapping work to do here, all looks good!")
    print("")
    exit(0)

# ----------------------------------------------------------------------------
# CASE: The cluster is partially down, but some cluster hosts are still ok.
# ----------------------------------------------------------------------------

if any(map(is_primary, status.values())) and any(map(is_ready, status.values())):
    print("The cluster seems to be in a degraded status.")
    print("Please investigate the cluster status.")
    print("- Can the cluster hosts reach each other over the network?")
    print("- Are all mariadb instances running?")
    print("")
    print("It might help to (re)start the database server on the degraded node(s):")
    print("")
    commands = [
        "lxc exec %s service mysql restart" % s["lxd_host"]
        for s in status.values()
        if not is_primary(s)
    ]
    propose(commands, 1)

# ----------------------------------------------------------------------------
# CASE: All cluster nodes are down, one cluster node is safe to bootstrap.
# ----------------------------------------------------------------------------

print("None of the cluster hosts is operational. A cluster bootup is required.")
safe_to_bootstrap = [s for s in status.values() if s["safe_to_bootstrap"] == 1]
if safe_to_bootstrap:
    bootstrap_node = safe_to_bootstrap[0]
    print("A node is marked as 'safe to bootstrap', so proposed strategy:")
    print("")
    commands = ["lxc exec %s galera_new_cluster" % bootstrap_node["lxd_host"]]
    commands.extend(start_commands_for_other_nodes(bootstrap_node))
    propose(commands, 2)

# ----------------------------------------------------------------------------
# CASE: All cluster nodes are down, no cluster node is safe to bootstrap.
# ----------------------------------------------------------------------------

print("Unfortunately, none of the nodes is marked as safe to bootstrap.")
print("Retrieving last recovered position for all cluster nodes ...")
print("")
print("%-20s %-15s %-40s" % ("Node", "Recovery pos", "UUID"))

for n, s in status.items():
    try:
        result = subprocess.check_output([
            "lxc", "exec", s["lxd_host"],
            "/root/galera_wsrep_recovered_position"])
        # The helper output is decoded as JSON and then split on the first
        # colon into a UUID part and a numeric position part.
        uuid_and_pos = json.loads(result)
        uuid, pos = re.split(':', uuid_and_pos, maxsplit=1)
        s["uuid"] = uuid
        s["pos"] = int(pos)
    except (subprocess.CalledProcessError, ValueError):
        # Command failure or unparsable output: make sure this node sorts
        # below every node that did report a position.
        s["uuid"] = "Unknown"
        s["pos"] = -1
    print("%-20s %-15d %-40s" % (n, s["pos"], s["uuid"]))

uuids = set(s["uuid"] for s in status.values())
if len(uuids) != 1:
    print("")
    print("Wow... now wait a minute... There are multiple UUID's in play!")
    print("That should never happen in a Galera cluster.")
    print("You will have to handle this one yourself I'm afraid.")
    # Do not fall through to the bootstrap proposal below: the positions of
    # nodes that report different UUIDs are not compared any further.
    exit(4)


def get_pos_key(x):
    """Sort key: the recovered position of a node status dict."""
    return x["pos"]


# values() instead of itervalues(), so this also runs on Python 3.
old_to_new = sorted(status.values(), key=get_pos_key)
bootstrap_node = old_to_new[-1]

print("")
print("Determined a node that is safe for bootstrapping, so proposed strategy:")
print("")
commands = [
    "lxc exec %s /root/galera_flag_as_safe_to_bootstrap" % bootstrap_node["lxd_host"],
    "lxc exec %s galera_new_cluster" % bootstrap_node["lxd_host"]
]
commands.extend(start_commands_for_other_nodes(bootstrap_node))
propose(commands, 3)