sidn-lxd-ansible-demo/roles/app.galera_bootstrap/templates/galera_bootstrap_advisor.j2

179 lines
5.9 KiB
Django/Jinja

#!/usr/bin/env python
# {{ ansible_managed }}
# Author: Maurice Makaay, XS4ALL
from __future__ import print_function
import subprocess
import re
import json
from os import system
from sys import exit, stdin
# Inventory data, filled in when this template is rendered:
# - nodes: the members of the 'galera' inventory group, serialized as JSON.
#   The script iterates over it as a list of node names.
# - lxd_status: indexed by node name further down; each entry is expected to
#   provide a "host" key, which is combined with the node name to address
#   the container as "<host>:<node>" for `lxc exec`.
nodes = {{ groups['galera'] | to_json }}
lxd_status = {{ lxd_status | to_json }}
def propose(commands, exit_code):
    """Show the proposed shell commands and optionally execute them.

    The commands are listed, after which the user is asked for confirmation
    on stdin. This function never returns: it always terminates the script.

    Args:
        commands: list of shell command strings, executed in the given order.
        exit_code: exit status to use when the user does not confirm.

    Exits:
        With status 0 when the user confirmed and the commands were run,
        with status exit_code otherwise.
    """
    # Imported locally, because the file header only imports exit and stdin.
    from sys import stdout

    for command in commands:
        print("# %s" % command)
    print("")
    print("Execute now? [y/n]: ", end="")
    # The prompt has no trailing newline, so flush it explicitly to make
    # sure that it is visible before blocking on stdin.
    stdout.flush()
    answer = stdin.readline()
    # Only accept answers that start with "y". A plain substring test would
    # also treat input like "nay" or "not today" as a confirmation, which is
    # too risky for commands that (re)start database servers.
    if answer.strip().lower().startswith("y"):
        print("")
        for command in commands:
            print("EXECUTING> %s" % command)
            system(command)
            print("")
        exit(0)
    exit(exit_code)
print("Collecting Galera status information from nodes ...")
status = {}
for node in nodes:
    # Containers are addressed as "<lxd host>:<container name>".
    lxd_host = "%s:%s" % (lxd_status[node]["host"], node)
    try:
        result = subprocess.check_output([
            "lxc", "exec", lxd_host, "/root/galera_cluster_status"])
        status[node] = json.loads(result)
    except (subprocess.CalledProcessError, ValueError):
        # The status script exited with an error, or its output could not
        # be parsed as JSON. Register placeholder values, so the node is
        # treated as not operational by the checks further down.
        status[node] = {
            "cluster_size": 0,
            "cluster_status": 'Status Failed',
            "connected": "Unknown",
            "ready": "Unknown",
            "safe_to_bootstrap": 0,
            "seqno": -1,
            "uuid": None
        }
    # Keep the addressing info with the status, so commands can be built
    # from a status entry alone.
    status[node]['lxd_host'] = lxd_host
    status[node]['node'] = node
def is_primary(s):
    """Tell whether a node status dict reports cluster status "Primary"."""
    cluster_status = s["cluster_status"]
    return cluster_status == "Primary"
def has_correct_cluster_size(s):
    """Tell whether the reported cluster size equals the number of nodes."""
    reported_size = s["cluster_size"]
    return reported_size == len(nodes)
def is_connected(s):
    """Tell whether a node status dict has its "connected" flag set to "ON"."""
    connected = s["connected"]
    return connected == "ON"
def is_ready(s):
    """Tell whether a node status dict has its "ready" flag set to "ON"."""
    ready = s["ready"]
    return ready == "ON"
# Print an overview table: a header line, followed by one row per node.
row_format = "%-20s %-15s %-6s %-12s %-7s"
print("")
print(row_format % ("Node", "Status", "Size", "Connected", "Ready"))
for node in nodes:
    s = status[node]
    columns = (
        node,
        s["cluster_status"],
        s["cluster_size"],
        s["connected"],
        s["ready"],
    )
    print(row_format % columns)
print("")
print("Checking cluster status ...")
print("")
# ----------------------------------------------------------------------------
# CASE: Every node reports a healthy, complete cluster; nothing to do.
# ----------------------------------------------------------------------------
all_primary = all(is_primary(s) for s in status.values())
all_size_ok = all(has_correct_cluster_size(s) for s in status.values())
all_connected = all(is_connected(s) for s in status.values())
all_ready = all(is_ready(s) for s in status.values())
if all_primary and all_size_ok and all_connected and all_ready:
    print("There's no bootstrapping work to do here, all looks good!")
    print("")
    exit(0)
# ----------------------------------------------------------------------------
# CASE: The cluster is partially down, but some cluster hosts are still ok.
# ----------------------------------------------------------------------------
if (any(is_primary(s) for s in status.values())
        and any(is_ready(s) for s in status.values())):
    for line in (
        "The cluster seems to be in a degraded status.",
        "Please investigate the cluster status.",
        "- Can the cluster hosts reach each other over the network?",
        "- Are all mariadb instances running?",
        "",
        "It might help to (re)start the database server on the degraded node(s):",
        "",
    ):
        print(line)
    # Propose a restart for every node that is not in the Primary state.
    commands = []
    for s in status.values():
        if is_primary(s):
            continue
        commands.append("lxc exec %s service mysql restart" % s["lxd_host"])
    propose(commands, 1)
# ----------------------------------------------------------------------------
# CASE: The whole cluster is down and a node is flagged as safe to bootstrap.
# ----------------------------------------------------------------------------
print("None of the cluster hosts is operational. A cluster bootup is required.")
safe_to_bootstrap = [s for s in status.values() if s["safe_to_bootstrap"] == 1]
if safe_to_bootstrap:
    # Use the first flagged node to bring up the new cluster.
    bootstrap_node = safe_to_bootstrap[0]
    print("A node is marked as 'safe to bootstrap', so proposed strategy:")
    print("")
    commands = ["lxc exec %s galera_new_cluster" % bootstrap_node["lxd_host"]]
    # All other nodes get a regular database start after the bootstrap.
    commands.extend(
        "lxc exec %s service mysql start" % s["lxd_host"]
        for n, s in status.items()
        if n != bootstrap_node["node"]
    )
    propose(commands, 2)
# ----------------------------------------------------------------------------
# CASE: All cluster nodes are down, no cluster node is safe to bootstrap.
# ----------------------------------------------------------------------------
print("Unfortunately, none of the nodes is marked as safe to bootstrap.")
print("Retrieving last recovered position for all cluster nodes ...")
print("")
print("%-20s %-15s %-40s" % ("Node", "Recovery pos", "UUID"))
for n, s in status.items():
    lxd_host = "%s:%s" % (lxd_status[n]["host"], n)
    try:
        result = subprocess.check_output([
            "lxc", "exec", lxd_host, "/root/galera_wsrep_recovered_position"])
        # The output is parsed as a JSON string of the form "<uuid>:<pos>".
        uuid_and_pos = json.loads(result)
        uuid, pos = re.split(':', uuid_and_pos, maxsplit=1)
        s["uuid"] = uuid
        s["pos"] = int(pos)
    except (subprocess.CalledProcessError, ValueError):
        # The helper failed or its output could not be parsed (invalid JSON,
        # no ":" separator, or a non-numeric position). Use placeholders
        # that can never win the "most recent position" selection.
        s["uuid"] = "Unknown"
        s["pos"] = -1
    print("%-20s %-15d %-40s" % (n, s["pos"], s["uuid"]))

uuids = set(s["uuid"] for s in status.values())
if len(uuids) != 1:
    print("")
    print("Wow... now wait a minute... There are multiple UUID's in play!")
    print("That should never happen in a Galera cluster.")
    print("You will have to handle this one yourself I'm afraid.")
    # Stop here instead of falling through to the bootstrap proposal: the
    # positions of nodes with different UUIDs (or with the "Unknown"
    # placeholder of a failed lookup) cannot be compared reliably.
    # Exit code 4 follows the codes 1-3 used for the other cases.
    exit(4)


def get_pos_key(x):
    """Sort key: the recovered position stored in a node status dict."""
    return x["pos"]


# The node with the highest recovered position holds the most recent data.
# values() is used instead of itervalues(), so this works on both Python 2
# and Python 3, like the rest of the script.
old_to_new = sorted(status.values(), key=get_pos_key)
bootstrap_node = old_to_new[-1]
print("")
print("Determined a node that is safe for bootstrapping, so proposed strategy:")
print("")
commands = [
    "lxc exec %s /root/galera_flag_as_safe_to_bootstrap" % bootstrap_node["lxd_host"],
    "lxc exec %s galera_new_cluster" % bootstrap_node["lxd_host"]
]
for n, s in status.items():
    if n == bootstrap_node["node"]:
        continue
    commands.append("lxc exec %s service mysql start" % s["lxd_host"])
propose(commands, 3)