"""check_arcce_clean - Re-runs failed cleanup tasks."""

import asyncio
from datetime import datetime
from enum import Enum
import os
import random
import time
import sqlite3
from typing import Tuple
from arcnagios import arcutils, arcclients
from arcnagios.ce.jobutils import JobNagiosPlugin
from arcnagios.nagutils import OK, WARNING, CRITICAL, ServiceReport, ServiceOk
from arcnagios.utils import counted_adjectives, log_process_error, \
        Result, ResultOk, ResultError

# FIXME: This is a workaround for dealing with an overfull jobs file, in which
# case arcstat spends a bit over 60 seconds querying CEs, independent of the
# timeout, while we really only need to know the list of orphaned job IDs.  We
# could keep an independent list of job IDs or CEs (to limit the arcstat query)
# or a future version of arcstat could provide an option to disable the probing.
def read_jobs_file():
    """Yield ``(job_id, Arcstat)`` pairs read directly from ``~/.arc/jobs.dat``.

    The jobs file is opened as an SQLite database and the ``id``,
    ``localsubmissiontime`` and ``state`` columns of its ``jobs`` table are
    read.  For each row an ``arcutils.Arcstat`` is built with

    * ``state`` converted by ``arcutils.jobstate_of_str``,
    * ``submitted`` formatted as ``'%Y-%m-%d %H:%M:%S'`` from the integer
      timestamp, using the local time zone (``datetime.fromtimestamp``),
    * ``specific_state`` set to ``""`` and ``job_error`` / ``exit_code`` set
      to ``None``, since this function does not read them from the file.

    This is a generator; the database connection stays open while rows are
    being consumed and is closed when the generator finishes or is discarded.
    """
    # expanduser() falls back to the password database when HOME is unset,
    # whereas os.getenv("HOME") would pass None on to os.path.join.
    db_path = os.path.join(os.path.expanduser("~"), ".arc", "jobs.dat")

    # The connection's own context manager only commits or rolls back the
    # transaction; it does not close the connection.  Close it explicitly.
    db = sqlite3.connect(db_path)
    try:
        query = "SELECT id, localsubmissiontime, state FROM jobs"
        for job_id, submission_time, state_str in db.execute(query):
            submitted = datetime.fromtimestamp(int(submission_time)) \
                                .strftime('%Y-%m-%d %H:%M:%S')
            stat = arcutils.Arcstat(
                state=arcutils.jobstate_of_str(state_str),
                specific_state="", submitted=submitted,
                job_error=None, exit_code=None)
            yield (job_id, stat)
    finally:
        db.close()

class PruneResult(Enum):
    """Outcome of an attempt to prune a single job."""

    # The job was left untouched.
    SKIPPED = 1
    # The job was cleaned.
    PRUNED = 2
    # Cleaning the job was attempted but failed.
    FAILED = 3
    # There was not enough time left to attempt cleaning the job.
    TIMEDOUT = 4

class Check_arcce_clean(JobNagiosPlugin):
    """Probe which runs scheduled cleanup work and then prunes ARC jobs.

    The check first lets ``self.cleaner`` process its scheduled work and then
    attempts to ``arcclean`` the jobs listed by arcstat (or read from the jobs
    file when ``--bypass-arcstat`` is given).  For each of the two phases the
    ratio of postponed to processed work is compared against the ``-w`` and
    ``-c`` thresholds, and the worse of the two states is reported.
    """

    def __init__(self):
        """Register the command-line options and record the start time."""
        JobNagiosPlugin.__init__(self)
        group = self.argparser.add_argument_group('Job Cleaner Options')
        group.add_argument(
            '--timeout', dest='timeout', type=int, default=120,
            help='Overall timeout for probe, but currently does not limit '
                 'scheduled cleanup.')
        group.add_argument(
            '--max-age', dest='max_age', type=int, default=604800,
            help='Max age before jobs info is cleaned.')
        group.add_argument(
            '--arcclean-timeout-min', type=int, default=20,
            help='Minimum timeout to pass to arcclean when approaching '
                 'the deadline before postponing the task.')
        group.add_argument(
            '--arcclean-timeout-max', type=int, default=20,
            help='Maximum timeout to pass to arcclean when enough time is '
                 'available.')
        group.add_argument(
            '--arcstat-timeout', dest='arcstat_timeout', metavar='T',
            type=int, default=5,
            help='Passed to arcstat --timeout.')
        group.add_argument(
            '--bypass-arcstat', action='store_true', default=False,
            help='Read jobs directly from ~/.arc/jobs.dat instead of '
                 'invoking arcstat. This is a workaround to avoid '
                 'timeout if the job file has become too big, esp. when '
                 'there are many unavailable CEs.')
        # The backslash continuations below keep the indentation of the
        # following line inside the help string; they are left as they are so
        # that the option help text stays unchanged.
        group.add_argument(
            '-w', dest='warning_load', type=float, default=10,
            help = 'Ratio of remaining work to processed work above which \
                    to issue a warning alert.')
        group.add_argument(
            '-c', dest='critical_load', type=float, default=20,
            help = 'Ratio of remaining work to processed work above which \
                    to issue a critical alert.')
        # Reference point for time_left() and for computing job ages.
        self._t_start = time.time()

    def time_left(self) -> float:
        """Return the seconds remaining of ``--timeout``.

        The time is counted from the moment the plugin object was
        constructed; the result is negative once the deadline has passed.
        """
        offset = self.opts.timeout - time.time()
        return offset + self._t_start

    async def _prune_job(self, active_jobids, jobid, jobstat) -> PruneResult:
        """Try to clean one job and report what happened.

        :param active_jobids: Collection of job IDs considered active; only
            consulted for jobs without a submission time.
        :param jobid: The ID of the job to clean.
        :param jobstat: Status of the job; its ``submitted`` and ``state``
            attributes are used.
        :returns: ``SKIPPED`` if the job is not older than ``--max-age`` or
            if it lacks a submission time, is in the undefined state and is
            listed as active; ``TIMEDOUT`` if less than
            ``--arcclean-timeout-min`` seconds are left once the semaphore
            has been acquired; ``FAILED`` if arcclean returned an error;
            otherwise ``PRUNED``.
        """
        if not jobstat.submitted:
            unavailable = jobstat.state == arcutils.J_UNDEFINED
            if unavailable and jobid in active_jobids:
                self.log.info('Skipping unavailable but active %s.', jobid)
                return PruneResult.SKIPPED
        else:
            t_sub = time.mktime(
                time.strptime(jobstat.submitted, '%Y-%m-%d %H:%M:%S'))
            age = self._t_start - t_sub
            if age <= self.opts.max_age:
                self.log.debug('Skipping too recent job %s.', jobid)
                return PruneResult.SKIPPED

        async with self._arcclients_semaphore:
            # Measure the remaining time only after the semaphore has been
            # acquired, since waiting for it may have used up the budget.
            remaining = self.time_left()
            if remaining < self.opts.arcclean_timeout_min:
                return PruneResult.TIMEDOUT
            outcome = await arcclients.arcclean(
                jobid, force=True,
                timeout=min(remaining, self.opts.arcclean_timeout_max),
                log=self.log)

        if not isinstance(outcome, ResultError):
            self.log.info('Cleaned job %s (state %s, submitted %sZ).',
                          jobid, jobstat.state, jobstat.submitted)
            return PruneResult.PRUNED

        details = (jobid, jobstat.state, jobstat.submitted)
        synopsis = 'Failed to clean job %s (state %s, submitted %sZ).' % details
        log_process_error(self.log, outcome.error,
                          synopsis=synopsis, prefix='arcclean')
        return PruneResult.FAILED

    async def prune_jobs(self) -> Result[Tuple[int, int, int], Exception]:
        """Attempt to clean all known jobs, in random order.

        The jobs are taken from ``read_jobs_file()`` if ``--bypass-arcstat``
        was given and from ``arcutils.arcstat`` otherwise.  One task per job
        is started and all of them are awaited.

        :returns: ``ResultOk((pruned, failed, timed_out))`` with the number
            of jobs per outcome (skipped jobs are not counted), or a
            ``ResultError`` if less than one second was left before the jobs
            could be listed.
        """
        active_jobids = self.collect_active_jobids()

        remaining = self.time_left()
        if remaining < 1:
            self.log.warning('Timeout before querying jobs to prune.')
            return ResultError(RuntimeError('Timeout'))

        if not self.opts.bypass_arcstat:
            stats_by_jobid = arcutils.arcstat(
                log=self.log,
                timeout=min(remaining, self.opts.arcstat_timeout),
                show_unavailable=True)
            jobstats = list(stats_by_jobid.items())
        else:
            jobstats = list(read_jobs_file())
        random.shuffle(jobstats)

        pending = [
            asyncio.create_task(self._prune_job(active_jobids, job_id, stat))
            for job_id, stat in jobstats
        ]
        outcomes = list(await asyncio.gather(*pending))
        tally = (
            outcomes.count(PruneResult.PRUNED),
            outcomes.count(PruneResult.FAILED),
            outcomes.count(PruneResult.TIMEDOUT),
        )
        return ResultOk(tally)

    def _check_load(self, load: float, msg: str) -> Tuple[int, str]:
        """Classify *load* against the ``-c`` and ``-w`` thresholds.

        :returns: A pair of the service state and *msg* with a suffix
            appended which matches the state.
        """
        if load > self.opts.critical_load:
            return (CRITICAL, msg + ', critical load!')
        if load > self.opts.warning_load:
            return (WARNING, msg + ', high load!')
        return (OK, msg + '.')

    async def _check_async(self) -> ServiceReport:
        """Run the scheduled cleanup and the job pruning, and build the report.

        Returns ``ServiceOk`` right away if ``self.top_workdir`` does not
        exist.  Otherwise a VOMS proxy is required, two thirds of the
        remaining time are offered to the scheduled work, and the jobs are
        pruned afterwards.
        """
        if not os.path.exists(self.top_workdir):
            self.log.info('The work directory is %s.', self.top_workdir)
            return ServiceOk('No jobs to clean since the working directory '
                             'has not yet been created.')
        self.require_voms_proxy()

        # Phase 1: scheduled work.
        sched_run = self.cleaner.run(
            timeout=self.time_left() * 2 / 3,
            semaphore=self._arcclients_semaphore)
        s_ok, s_retry, s_failed, s_postponed = await sched_run
        # The +1 keeps the denominator non-zero when nothing was processed.
        s_load = s_postponed / float(s_ok + s_failed + 1)
        s_counts = [
            (s_ok, 'ok'),
            (s_retry, 'to retry'),
            (s_failed, 'failed'),
            (s_postponed, 'postponed'),
        ]
        s_msg = 'Sched: ' + counted_adjectives(s_counts, if_empty='no work')
        s_service_state, s_msg = self._check_load(s_load, s_msg)

        # Phase 2: prune ARC jobs if there is time.
        j_result = await self.prune_jobs()
        if not j_result.is_ok():
            j_service_state = CRITICAL
            j_msg = "No time left for checking ARC jobs."
        else:
            j_cleaned, j_failed, j_postponed = j_result.get() # pylint: disable=E1111
            j_load = j_postponed / float(j_cleaned + j_failed + 1)
            j_counts = [
                (j_cleaned, 'cleaned'),
                (j_failed, 'failed'),
                (j_postponed, 'postponed'),
            ]
            j_msg = 'Jobfile: ' \
                + counted_adjectives(j_counts, if_empty='no work')
            j_service_state, j_msg = self._check_load(j_load, j_msg)

        # Report the worse of the two states along with both messages.
        overall_state = max(s_service_state, j_service_state)
        return ServiceReport(overall_state, s_msg + ' ' + j_msg)

    def check(self) -> ServiceReport:
        """Run the asynchronous check to completion and return its report."""
        return asyncio.run(self._check_async())
