package com.yahoo.vespa.hosted.provision.maintenance;

import com.yahoo.cloud.config.ConfigserverConfig;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.config.provision.NodeType;
import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
import com.yahoo.vespa.orchestrator.status.HostStatus;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.TemporalAmount;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;

/* loaded from: input_file:com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.class */
/**
 * Maintainer which moves nodes to the failed state.
 *
 * <p>Each run does two things:
 * <ol>
 *   <li>Under the unallocated lock: records config requests seen by the {@link HostLivenessTracker}
 *       in the history of ready nodes, then fails ready nodes which have stopped requesting config
 *       or which report a hardware failure or hardware divergence.</li>
 *   <li>Records in the history of active nodes whether the {@link ServiceMonitor} sees them as down,
 *       then fails active nodes which have been down longer than the down time limit, or which have
 *       a hardware failure and are suspended in the {@link Orchestrator}.</li>
 * </ol>
 *
 * <p>All failing is subject to the configured {@link ThrottlePolicy}.
 */
public class NodeFailer extends Maintainer {

    private static final Logger log = Logger.getLogger(NodeFailer.class.getName());

    /** Interval used when deciding how old a config request from a ready node may be */
    private static final Duration nodeRequestInterval = Duration.ofMinutes(10);

    /** Provides the time of the last request seen from each host */
    private final HostLivenessTracker hostLivenessTracker;

    /** Provides the up/down status of the services on each host */
    private final ServiceMonitor serviceMonitor;

    /** Used to redeploy an application without the node being failed */
    private final Deployer deployer;

    /** How long an active node may be recorded as down before it is failed */
    private final Duration downTimeLimit;

    private final Clock clock;
    private final Orchestrator orchestrator;

    /** The time this was constructed, as read from {@link #clock} */
    private final Instant constructionTime;

    private final ThrottlePolicy throttlePolicy;
    private final Metric metric;
    private final ConfigserverConfig configserverConfig;

    /** Limits on how many nodes may be failed within a time window. */
    public enum ThrottlePolicy {

        /** At most 1% of all nodes, or 2 nodes if that is more, over one day */
        hosted(Duration.ofDays(1), 0.01d, 2),

        /** No throttling: {@link NodeFailer#throttle} returns false without counting anything */
        disabled(Duration.ZERO, 0.0d, 0);

        /** Failures more recent than this long ago count against the limit */
        public final Duration throttleWindow;

        /** The fraction (0-1) of all nodes allowed to fail within the window */
        public final double fractionAllowedToFail;

        /** A lower bound on the number of nodes allowed to fail within the window */
        public final int minimumAllowedToFail;

        ThrottlePolicy(Duration throttleWindow, double fractionAllowedToFail, int minimumAllowedToFail) {
            this.throttleWindow = throttleWindow;
            this.fractionAllowedToFail = fractionAllowedToFail;
            this.minimumAllowedToFail = minimumAllowedToFail;
        }

        /** Returns a description of this policy suitable for log messages */
        public String toHumanReadableString() {
            return String.format("Max %.0f%% or %d nodes can fail over a period of %s",
                                 fractionAllowedToFail * 100.0d,
                                 minimumAllowedToFail,
                                 throttleWindow);
        }

    }

    /**
     * Creates a node failer.
     *
     * @param duration the down time limit: how long an active node may be recorded as down before it
     *                 is failed. The maintenance interval passed to the superclass is the smaller of
     *                 half this value and 5 minutes.
     */
    public NodeFailer(Deployer deployer, HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor, NodeRepository nodeRepository, Duration duration, Clock clock, Orchestrator orchestrator, ThrottlePolicy throttlePolicy, Metric metric, JobControl jobControl, ConfigserverConfig configserverConfig) {
        // NOTE(review): min(Duration, Duration) is not declared in this file - presumably inherited from Maintainer
        super(nodeRepository, min(duration.dividedBy(2L), Duration.ofMinutes(5L)), jobControl);
        this.deployer = deployer;
        this.hostLivenessTracker = hostLivenessTracker;
        this.serviceMonitor = serviceMonitor;
        this.downTimeLimit = duration;
        this.clock = clock;
        this.orchestrator = orchestrator;
        this.constructionTime = clock.instant();
        this.throttlePolicy = throttlePolicy;
        this.metric = metric;
        this.configserverConfig = configserverConfig;
    }

    /**
     * Fails ready nodes while holding the unallocated lock, then releases it and
     * updates the down state of, and fails, active nodes.
     */
    @Override
    protected void maintain() {
        // Ready nodes: the lock is released exactly once when this block is left, normally or not
        try (Mutex lock = nodeRepository().lockUnallocated()) {
            updateNodeLivenessEventsForReadyNodes();

            getReadyNodesByFailureReason().forEach((node, reason) -> {
                if (!throttle(node)) {
                    nodeRepository().fail(node.hostname(), Agent.system, reason);
                }
            });
        }

        // Active nodes: done outside the unallocated lock; application locks are taken per node where needed
        updateNodeDownState();
        getActiveNodesByFailureReason().forEach((node, reason) -> {
            if (failAllowedFor(node.type()) && !throttle(node)) {
                failActive(node, reason);
            }
        });
    }

    /**
     * Writes a 'requested' event to the history of each ready node for which the liveness tracker
     * has seen a request which the history has no 'requested' event after.
     */
    private void updateNodeLivenessEventsForReadyNodes() {
        for (Node node : nodeRepository().getNodes(Node.State.ready)) {
            Optional<Instant> lastRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
            if (!lastRequest.isPresent()) continue;
            if (node.history().hasEventAfter(History.Event.Type.requested, lastRequest.get())) continue;

            History.Event requested = new History.Event(History.Event.Type.requested, Agent.system, lastRequest.get());
            nodeRepository().write(node.with(node.history().with(requested)));
        }
    }

    /** Returns the ready nodes which should be failed, mapped to the reason for failing them */
    private Map<Node, String> getReadyNodesByFailureReason() {
        Instant now = clock.instant();

        // If this was constructed less than two request intervals ago the threshold is the epoch,
        // otherwise it is the down time limit plus one request interval back in time.
        // NOTE(review): presumably the epoch threshold gives requests time to be registered after a restart - confirm
        Instant oldestAcceptableRequestTime =
                constructionTime.isAfter(now.minus(nodeRequestInterval.multipliedBy(2L)))
                        ? Instant.EPOCH
                        : clock.instant().minus(downTimeLimit).minus(nodeRequestInterval);

        Map<Node, String> nodesByFailureReason = new HashMap<>();
        for (Node node : nodeRepository().getNodes(Node.State.ready)) {
            if (expectConfigRequests(node) && !hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) {
                nodesByFailureReason.put(node, "Not receiving config requests from node");
            } else if (node.status().hardwareFailureDescription().isPresent()) {
                nodesByFailureReason.put(node, "Node has hardware failure");
            } else if (node.status().hardwareDivergence().isPresent()) {
                nodesByFailureReason.put(node, "Node has hardware divergence");
            }
        }
        return nodesByFailureReason;
    }

    /**
     * Records each active node which has an entry in the service monitor's snapshot as down or up,
     * as decided by {@link #badNode}. Hosts in the snapshot which are not active nodes are ignored.
     */
    private void updateNodeDownState() {
        Map<String, Node> activeNodesByHostname = nodeRepository().getNodes(Node.State.active).stream()
                .collect(Collectors.toMap(Node::hostname, node -> node));

        Map<HostName, List<ServiceInstance>> servicesByHost =
                serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName();

        servicesByHost.forEach((hostName, serviceInstances) -> {
            Node node = activeNodesByHostname.get(hostName.s());
            if (node == null) return;

            if (badNode(serviceInstances)) {
                recordAsDown(node);
            } else {
                clearDownRecord(node);
            }
        });
    }

    /** Returns the active nodes which should be failed, mapped to the reason for failing them */
    private Map<Node, String> getActiveNodesByFailureReason() {
        Instant graceTimeEnd = clock.instant().minus(downTimeLimit);

        Map<Node, String> nodesByFailureReason = new HashMap<>();
        for (Node node : nodeRepository().getNodes(Node.State.active)) {
            if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && !applicationSuspended(node)) {
                nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
            } else if (node.status().hardwareFailureDescription().isPresent() && nodeSuspended(node)) {
                nodesByFailureReason.put(node, "Node has hardware failure");
            }
        }
        return nodesByFailureReason;
    }

    /** Returns false only for docker hosts when node admin does not run in a container */
    private boolean expectConfigRequests(Node node) {
        return !node.type().isDockerHost() || configserverConfig.nodeAdminInContainer();
    }

    /**
     * Returns whether the node counts as having requested config after the given instant:
     * true if it has a 'requested' event after the instant, and also true if it has no
     * 'readied' event before the instant.
     */
    private boolean hasNodeRequestedConfigAfter(Node node, Instant instant) {
        return !wasMadeReadyBefore(node, instant) || hasRecordedRequestAfter(node, instant);
    }

    private boolean wasMadeReadyBefore(Node node, Instant instant) {
        return node.history().hasEventBefore(History.Event.Type.readied, instant);
    }

    private boolean hasRecordedRequestAfter(Node node, Instant instant) {
        return node.history().hasEventAfter(History.Event.Type.requested, instant);
    }

    /**
     * Returns whether the orchestrator reports the application owning this node as allowed to be down.
     * An application unknown to the orchestrator is treated as not suspended.
     * The node must be allocated.
     */
    private boolean applicationSuspended(Node node) {
        try {
            return orchestrator.getApplicationInstanceStatus(node.allocation().get().owner())
                   == ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN;
        } catch (ApplicationIdNotFoundException e) {
            // Deliberate: the orchestrator does not know this application, so it cannot be suspended there
            return false;
        }
    }

    /**
     * Returns whether the orchestrator reports this node as allowed to be down.
     * A host unknown to the orchestrator is treated as not suspended.
     */
    private boolean nodeSuspended(Node node) {
        try {
            return orchestrator.getNodeStatus(new HostName(node.hostname())) == HostStatus.ALLOWED_TO_BE_DOWN;
        } catch (HostNameNotFoundException e) {
            // Deliberate: the orchestrator does not know this host, so it cannot be suspended there
            return false;
        }
    }

    /**
     * Returns whether a node of the given type may be failed now: always for tenant and host nodes,
     * for other types only when no node of that type is currently in the failed state.
     */
    private boolean failAllowedFor(NodeType nodeType) {
        if (nodeType == NodeType.tenant || nodeType == NodeType.host) return true;
        return nodeRepository().getNodes(nodeType, Node.State.failed).isEmpty();
    }

    /**
     * Returns true if no service in the given list is UP and at least one is DOWN.
     * An empty list, or one where no service is DOWN, yields false.
     *
     * @param list the service instances of a single host
     */
    public static boolean badNode(List<ServiceInstance> list) {
        Map<ServiceStatus, Long> countsByStatus = list.stream()
                .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, Collectors.counting()));

        long up = countsByStatus.getOrDefault(ServiceStatus.UP, 0L);
        long down = countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L);
        return up <= 0 && down > 0;
    }

    /**
     * Records a 'down' event at the current time on the given active, allocated node,
     * unless its history already has one, in which case the existing timestamp is kept.
     *
     * @return the written node, or the given node if it was already recorded as down
     */
    private Node recordAsDown(Node node) {
        if (node.history().event(History.Event.Type.down).isPresent()) return node;

        try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) {
            // Re-read the node under the application lock so the write is based on its current value
            Node current = nodeRepository().getNode(node.hostname(), Node.State.active).get();
            return nodeRepository().write(current.downAt(clock.instant()));
        }
    }

    /** Removes the 'down' record from the given active, allocated node, if its history has one */
    private void clearDownRecord(Node node) {
        if (!node.history().event(History.Event.Type.down).isPresent()) return;

        try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) {
            // Re-read the node under the application lock so the write is based on its current value
            Node current = nodeRepository().getNode(node.hostname(), Node.State.active).get();
            nodeRepository().write(current.up());
        }
    }

    /**
     * Fails the given active node and redeploys its application without it.
     *
     * <p>Child nodes of the node are handled first: active children are failed recursively through
     * this method, children in other states are failed directly. If any active child could not be
     * failed, the node itself is left untouched. If activating the redeployment throws a
     * {@link RuntimeException}, the node is reactivated.
     *
     * @return true if the node was failed and the redeployment activated, false otherwise,
     *         including when the deployer returns no deployment for the owning application
     */
    private boolean failActive(Node node, String reason) {
        Optional<Deployment> deployment =
                deployer.deployFromLocalActive(node.allocation().get().owner(), Duration.ofMinutes(30L));
        if (!deployment.isPresent()) return false;

        try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) {
            boolean allActiveChildrenFailed = true;
            String reasonForChildFailure = "Failing due to parent host " + node.hostname() + " failure: " + reason;
            for (Node child : nodeRepository().getChildNodes(node.hostname())) {
                if (child.state() == Node.State.active) {
                    // Non-short-circuit: keep trying the remaining children even after one has failed to fail
                    allActiveChildrenFailed &= failActive(child, reasonForChildFailure);
                } else {
                    nodeRepository().fail(child.hostname(), Agent.system, reasonForChildFailure);
                }
            }
            if (!allActiveChildrenFailed) return false;

            Node failed = nodeRepository().fail(node.hostname(), Agent.system, reason);
            try {
                deployment.get().activate();
                return true;
            } catch (RuntimeException e) {
                // Redeploying without the node did not work: put the node back rather than leave it failed
                nodeRepository().reactivate(failed.hostname(), Agent.system,
                                            "Failed to redeploy after being failed by NodeFailer");
                log.log(Level.WARNING, "Attempted to fail " + failed + " for " + failed.allocation().get().owner() +
                                       ", but redeploying without the node failed", e);
                return false;
            }
        }
    }

    /**
     * Returns true if the given node should not be failed now because of the throttle policy.
     *
     * <p>The number of nodes allowed to fail is the larger of the policy's fraction of all nodes in
     * the repository and the policy's minimum, truncated to an int. Failing is throttled when more
     * nodes than that have a 'failed' event within the throttle window, and also when exactly that
     * many have one and the given node is not a docker host.
     *
     * <p>Sets the metric "nodeFailThrottling" to 1 or 0 according to the result, except when the
     * policy is {@link ThrottlePolicy#disabled}, in which case this returns false immediately.
     */
    private boolean throttle(Node node) {
        if (throttlePolicy == ThrottlePolicy.disabled) return false;

        Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow);
        List<Node> allNodes = nodeRepository().getNodes();
        long recentlyFailedNodes = allNodes.stream()
                .filter(n -> n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow))
                .count();
        int allowedFailedNodes = (int) Math.max(allNodes.size() * throttlePolicy.fractionAllowedToFail,
                                                throttlePolicy.minimumAllowedToFail);

        boolean throttle = allowedFailedNodes < recentlyFailedNodes
                           || (allowedFailedNodes == recentlyFailedNodes && !node.type().isDockerHost());
        if (throttle) {
            log.info(String.format("Want to fail node %s, but throttling is in effect: %s",
                                   node.hostname(), throttlePolicy.toHumanReadableString()));
        }
        metric.set("nodeFailThrottling", throttle ? 1 : 0, null);
        return throttle;
    }

}
