<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
     which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!-- One method to get references from the online citation libraries.
     There has to be one entity for each item to be referenced. 
     An alternate method (rfc include) is described in the references. -->

<!ENTITY RFC2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY IPFRR SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-rtgwg-ipfrr-framework.xml">
<!ENTITY MICROLOOP SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-rtgwg-lf-conv-frmwk.xml">
<!ENTITY OFIB SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-rtgwg-ordered-fib.xml">
<!ENTITY PLSN SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-rtgwg-microloop-analysis.xml">
]>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs), 
     please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds might want to use.
     (Here they are set differently than their defaults in xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="no"?>
<!-- symrefs="yes" would use symbolic reference tags, i.e., [RFC2119] instead of [1];
     "no" (as set here) keeps numeric reference tags -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space 
     (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes"?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="info" docName="draft-bryant-francois-shand-ipfrr-aah-00.txt"
     ipr="full3978" updates="">
  <!-- category values: std, bcp, info, exp, and historic
     ipr values: full3667, noModification3667, noDerivatives3667
     you can add the attributes updates="NNNN" and obsoletes="NNNN" 
     they will automatically be output with "(if approved)" -->

  <!-- ***** FRONT MATTER ***** -->

  <front>
    <!-- The abbreviated title is used in the page header - it is only necessary if the 
         full title is longer than 39 characters -->

    <title abbrev="Abandon All Hope (AAH)">Mechanisms for safely abandoning
    loop-free convergence (AAH)</title>

    <!-- add 'role="editor"' below for the editors if appropriate -->

    <!-- Another author who claims to be an editor -->

    <author fullname="Mike Shand" initials="M." surname="Shand">
      <organization>Cisco Systems</organization>

      <address>
        <postal>
          <street>250, Longwater Avenue.</street>

          <!-- Reorder these if your country does things differently -->

          <city>Reading</city>

          <region>Berks</region>

          <code>RG2 6GB</code>

          <country>UK</country>
        </postal>

        <email>mshand@cisco.com</email>

        <!-- uri and facsimile elements may also be added -->
      </address>
    </author>

    <author fullname="Stewart Bryant" initials="S." surname="Bryant">
      <organization>Cisco Systems</organization>

      <address>
        <postal>
          <street>250, Longwater Avenue.</street>

          <city>Reading</city>

          <region>Berks</region>

          <code>RG2 6GB</code>

          <country>UK</country>
        </postal>

        <email>stbryant@cisco.com</email>
      </address>
    </author>

    <author fullname="Pierre Francois" initials="P." surname="Francois">
      <organization>Universite catholique de Louvain</organization>

      <address>
        <postal>
          <street></street>

          <city></city>

          <region></region>

          <code></code>

          <country></country>
        </postal>

        <email>pierre.francois@uclouvain.be</email>

        <uri>http://inl.info.ucl.ac.be/pfr</uri>
      </address>
    </author>

    <date year="2008" />

    <!-- If the month and year are both specified and are the current ones, xml2rfc will fill 
         in the current day for you. If only the current year is specified, xml2rfc will fill 
	 in the current day and month for you. If the year is not the current one, it is 
	 necessary to specify at least a month (xml2rfc assumes day="1" if not specified for the 
	 purpose of calculating the expiry date).  With drafts it is normally sufficient to 
	 specify just the year. -->

    <!-- Meta-data Declarations -->

    <!-- WG name at the upperleft corner of the doc,
         IETF is fine for individual submissions.  
	 If this element is not present, the default is "Network Working Group",
         which is used by the RFC Editor as a nod to the history of the IETF. -->

    <!-- Keywords will be incorporated into HTML output
         files in a meta tag but they have no effect on text or nroff
         output. If you submit your draft to the RFC Editor, the
         keywords will be used for the search engine. -->

    <abstract>
      <t>IPFRR and loop-free convergence techniques can deal with single
      topology change events, multiple correlated change events, and in some
      cases even certain uncorrelated events. However, in all cases there are
      events which cannot be dealt with and the mechanism needs to quickly
      revert to normal convergence. This is known as &ldquo;Abandoning All
      Hope&rdquo; (AAH). This document describes the nature of the problem,
      and various proposed mechanisms to deal with it.</t>
    </abstract>
  </front>

  <middle>
    <section title="Conventions used in this document">
      <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
      "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
      document are to be interpreted as described in RFC 2119 <xref
      target="RFC2119"></xref>.</t>

      <t></t>
    </section>

    <section title="Introduction">
      <t>IPFRR <xref target="I-D.ietf-rtgwg-ipfrr-framework"></xref> and
      loop-free convergence techniques <xref
      target="I-D.ietf-rtgwg-lf-conv-frmwk"></xref> can deal with single
      topology change events, multiple correlated change events, and in some
      cases even certain uncorrelated events. However, in all cases there are
      events which cannot be dealt with and the mechanism needs to quickly
      revert to normal convergence. This is known as &ldquo;Abandoning All
      Hope&rdquo; (AAH).</t>

      <t>A good example is the case of the ordered FIB loop-free convergence
      technique (oFIB) <xref target="I-D.ietf-rtgwg-ordered-fib"></xref>;
      however, the problem and the mechanisms described here for its
      resolution are equally applicable to any loop-free convergence
      mechanism, such as PLSN <xref
      target="I-D.ietf-rtgwg-microloop-analysis"></xref>. All the
      routers performing the calculation must have an identical view of the
      set of topology changes under consideration. One technique to ensure
      this is to start a hold-down timer on reception of the first event in
      the hope that all subsequent events related to the same root cause will
      arrive before the timer expires. If this is the case, then all routers
      in the network will have acquired an identical set of changes and
      processing can continue correctly. However, in some cases the timer
      value will be too short to ensure that all the related events have
      arrived at all routers (perhaps because there was some unexpected
      propagation delay, or one or more of the events are slow in being
      detected). In other cases, a completely unrelated event may occur after
      the timer has expired, but before the processing is complete. In either
      case it is necessary to &ldquo;Abandon all Hope&rdquo; and revert to
      traditional convergence.</t>

      <t>There are a number of problems with this na&iuml;ve approach.
      Firstly, since the timer is started at each router on reception of the
      first LSP announcing a topology change, the actual starting time is
      dependent upon the propagation time of the first LSP. So, for a
      subsequent event occurring around the time of the timer expiry, because
      of variations in propagation delay it may reach some routers before the
      timer expires and others after it has expired. In the former case this
      LSP will be included in the set of changes to be considered, while in
      the latter it will be excluded and would invoke an AAH in the routers
      receiving it. Clearly this would be a dangerous condition, and it is
      therefore necessary to arrange that an AAH invoked anywhere in the
      network causes ALL routers to AAH. This can be achieved by reliably
      propagating an AAH message throughout the network. However, this raises
      a second problem, the need to synchronize the exit from AAH state
      throughout the network.</t>

      <t>While in AAH state any topology changes previously received, or which
      are subsequently received, should be processed immediately using the
      traditional convergence algorithms i.e. without invoking controlled
      convergence. If the exit from the AAH state is not correctly
      synchronized, a new event may be processed by some routers immediately
      (as AAH), while those which have already left AAH state will treat it as
      the first of a new batch of changes and attempt controlled
      convergence.</t>
    </section>

    <section title="Possible Solutions">
      <t>A number of approaches to this problem have been proposed, in
      increasing order of complexity: <list style="numbers">
          <t>Hold-down timer only. This is the solution proposed in PLSN.</t>

          <t>Basic per event AAH messages</t>

          <t>Synchronization of AAH state using AAH messages.</t>
        </list></t>

      <t>These are described below. The purpose of this draft is to trigger
      discussion on the trade-offs between complexity and robustness in the
      AAH solution-space.</t>

      <t></t>

      <section title="Hold-down timer only">
        <t>This method uses a hold-down to acquire a set of LSPs which should
        be processed together. On expiry of the local hold-down timer, the
        router begins processing the batch of LSPs according to the loop free
        prevention algorithm.</t>
      </section>

      <section title="Basic per event AAH messages">
        <t>This method uses signaling between neighbors to announce the
        abandoning of controlled convergence.</t>

        <t>A router individually decides when it should abandon controlled
        convergence for a given (set of) LSP(s). It bases this decision on the
        LSP reception timings and the hold down timers defined for the
        controlled convergence mechanism used.</t>

        <t>When a router makes a decision to abandon controlled convergence
        for an LSP, it sends an AAH message to a selected subset of its
        neighbors. The message identifies the LSPs for which controlled
        convergence was abandoned.</t>

        <t>The reception of such a message MUST trigger the decision to
        abandon controlled convergence for this LSP by the receiver. The
        receiver SHOULD also abandon controlled convergence for the other
        pending LSPs.</t>

        <t>A router is only allowed to send AAH messages for a given event
        once. This can be achieved for example with a one bit flag in the LSP
        of the LSDB, stating whether convergence has been abandoned and
        signaled for this LSP. This can also be achieved by storing the
        identification of the LSPs for which convergence was abandoned for a
        time that is an order of magnitude longer than a typical IGP
        convergence (i.e., 10 seconds). The subset of neighbors to which an
        AAH message must be sent by a router R depends on the controlled
        convergence mechanism. It can be equal to all the neighbors of R, but
        not necessarily.</t>

        <t>For any controlled convergence mechanism, the selection of this
        subset MUST be such that if a router R abandons controlled
        convergence, all the routers who could create a forwarding loop with R
        by not abandoning controlled convergence will eventually abandon
        controlled convergence.</t>

        <t>For the case of controlled convergence using ordered-FIB:<list
            style="symbols">
            <t>In the case of a link up / node up / metric decrease event, the
            set MUST include the neighbors of R that are on the shortest paths
            between R and the originator of the LSP for which controlled
            convergence is abandoned.</t>

            <t>In the case of a link down / node down / metric increase event,
            the set MUST include the neighbors of R that are upstream of R on
            the paths towards the originator of the LSP for which controlled
            convergence is abandoned.</t>
          </list></t>
      </section>

      <section title="Synchronization of AAH state using AAH messages">
        <t>Like the others, this method uses a hold-down to acquire a set of
        LSPs which should be processed together. On expiry of the local
        hold-down timer, the router begins processing the batch of LSPs
        according to the loop free prevention algorithm. This is the same
        behaviour as the hold-down timer only method. However, if any router,
        having started the loop-free convergence process, receives an LSP which
        would trigger a topology change, it locally abandons the controlled
        convergence process, and sends an AAH message to all its neighbors.
        This eventually triggers all routers to abandon the controlled
        convergence. The routers remain in AAH state (i.e. processing topology
        changes using normal "fast" convergence), until a period of quiescence
        has elapsed. The exit from AAH state is synchronized by using a two
        step process.</t>

        <t>To achieve the required synchronization, two additional messages
        are required, AAH and AAH ACK. The AAH message is reliably exchanged
        between neighbours using the AAH ACK message. These could be
        implemented as a new message within the routing protocol or carried in
        existing routing hello messages.</t>

        <t>Two types of state machines are needed. A per-router AAH state
        machine and a per neighbour AAH state machine (PNSM). These are
        described below.</t>

        <section title="Per Router State Machine">
          <t></t>

          <t></t>

          <figure>
            <preamble>Per Router State Table</preamble>

            <artwork><![CDATA[
  +-------------+-----------+---------+--------+------------+----------+
  | EVENT       |     Q     |   Hold  |   CC   |     AAH    | AAH-hold |
  +=============+===========+=========+========+============+==========+
  | RX LSP      |   Start   |    -    | TX-AAH |  Re-start  |  TX-AAH  |
  | triggering  | hold-down |         | Start  | AAH timer. |   Start  |
  | change      |   timer   |         |  AAH   |    [AAH]   |    AAH   |
  |             |   [Hold]  |         | timer. |            |   timer. |
  |             |           |         | [AAH]  |            |   [AAH]  |
  +-------------+-----------+---------+--------+------------+----------+
  | RX AAH      |   TX-AAH  |  TX-AAH | TX-AAH |    [AAH]   |  TX-AAH  |
  | (Neighbor's | Start AAH |  Start  | Start  |            |   Start  |
  |  PNSM       |   timer.  |   AAH   |  AAH   |            |    AAH   |
  |  processes  |   [AAH]   |  timer  | timer. |            |   timer. |
  |  RX AAH.)   |           |  [AAH]  | [AAH]  |            |   [AAH]  |
  +-------------+-----------+---------+--------+------------+----------+
  | Timer       |     -     | Trigger |    -   |    Start   |    [Q]   |
  | expiry      |           |   CC.   |        |  AAH-hold  |          |
  |             |           |  [CC]   |        |   timer.   |          |
  |             |           |         |        | [AAH-hold] |          |
  +-------------+-----------+---------+--------+------------+----------+
  | Controlled  |     -     |    -    |   [Q]  |      -     |     -    |
  | convergence |           |         |        |            |          |
  | completed   |           |         |        |            |          |
  +-------------+-----------+---------+--------+------------+----------+]]></artwork>

            <postamble>TX-AAH = Send &ldquo;goto TX-AAH&rdquo; to all other
            PNSMs.</postamble>
          </figure>

          <t></t>

          <t>Operation of the per-router state machine is as follows:</t>

          <t>Operation of this state machine under normal topology change
          involves only states: Quiescent (Q), Hold-down (Hold) and Controlled
          Convergence (CC). The remaining states are associated with an AAH
          event.</t>

          <t>The resting state is Quiescent. When the router in the Quiescent
          state receives an LSP indicating a topology change, which would
          normally trigger an SPF, it starts the Hold-down timer and changes
          state to Hold-down. It normally remains in this state, collecting
          additional LSPs until the Hold-down timer expires. Note that all
          routers MUST use a common value for the Hold-down timer. When the
          Hold-down timer expires the router then enters Controlled
          Convergence (CC) state and executes the CC mechanism to re-converge
          the topology. When the CC process has completed on the router, the
          router re-enters the Quiescent state.</t>

          <t>If this router receives a topology changing LSP whilst it is in
          the CC state, it enters AAH state, and sends a &ldquo;goto
          TX-AAH&rdquo; command to all per neighbour state machines which
          causes each per-neighbour state machine to signal this state change
          to its neighbour. Alternatively, if this router receives an AAH
          message from any of its neighbors whilst in any state except AAH, it
          starts the AAH timer and enters the AAH state. The per neighbor
          state machine corresponding to the neighbor from which the AAH was
          received executes the RX AAH action (which causes it to send an AAH
          ACK), while the remainder are sent the &ldquo;goto TX-AAH&rdquo;
          command. The result is that the AAH is acknowledged to the neighbor
          from which it was received and propagated to all other neighbors. On
          entering AAH state, all CC timers are expired and normal convergence
          takes place.</t>

          <t>Whilst in the AAH state, LSPs are processed in the traditional
          manner. Each time an LSP is received, the AAH timer is restarted. In
          an unstable network ALL routers will remain in this state for some
          time and the network will behave in the traditional uncontrolled
          convergence manner.</t>

          <t>When the AAH timer expires, the router enters AAH-hold state and
          starts the AAH hold timer. The purpose of the AAH-hold state is to
          synchronize the transition of the network from AAH to Quiescent. The
          additional state ensures that the network cannot contain a mixture
          of routers in both AAH and Quiescent states. If, whilst in AAH-Hold
          state the router receives a topology changing LSP, it re-enters AAH
          state and commands all per neighbour state machines to &ldquo;goto
          TX-AAH&rdquo;. If, whilst in AAH-Hold state the router receives an
          AAH message from one of its neighbours, it re-enters the AAH state
          and commands all other per neighbour state machines to &ldquo;goto
          TX-AAH&rdquo;. Note that the per-neighbor state machine receiving
          the AAH message will autonomously acknowledge receipt of the AAH
          message. Commanding the per-neighbour state machine to &ldquo;goto
          TX-AAH&rdquo; is necessary, because routers may be in a mixture of
          Quiescent, Hold-down and AAH-hold state, and it is necessary to
          rendezvous the entire network back to AAH state.</t>

          <t>When the AAH Hold timer expires the router changes to state
          Quiescent and is ready for loop free convergence.</t>
        </section>

        <section title="Per Neighbor State Machine">
          <t></t>

          <figure>
            <preamble>Per Neighbor State Table</preamble>

            <artwork><![CDATA[
  +----------------------------+--------------+------------------------+
  | EVENT                      | Idle         | TX-AAH                 |
  +============================+==============+========================+
  | RX AAH                     | Send ACK.    | Send ACK.              |
  |                            |              | Cancel timer.          |
  |                            | [IDLE]       | [IDLE]                 |
  +----------------------------+--------------+------------------------+
  | RX ACK                     | ignore       | Cancel timer.          |
  |                            |              | [IDLE]                 |
  +----------------------------+--------------+------------------------+
  | RX "goto TX-AAH" from      | Send AAH     | ignore                 |
  | Router State Machine       | [TX-AAH]     |                        |
  +----------------------------+--------------+------------------------+
  | Timer expires              | impossible   | Send AAH               |
  |                            |              | Restart timer.         |
  |                            |              | [TX-AAH]               |
  +----------------------------+--------------+------------------------+]]></artwork>

            <postamble></postamble>
          </figure>

          <t></t>

          <t>There is one instance of the per-neighbour (PN) state machine for
          each neighbour within the convergence control domain.</t>

          <t>The normal state is IDLE.</t>

          <t>On command (&ldquo;goto TX-AAH&rdquo;) from the router state
          machine, the state machine enters TX-AAH state, transmits an AAH
          message to its neighbour and starts a timer.</t>

          <t>On receipt of an AAH ACK in state TX-AAH the state machine
          cancels the timer and enters IDLE state.</t>

          <t>In state IDLE, any AAH ACK message received is ignored.</t>

          <t>On expiry of the timer in state TX-AAH the state machine
          transmits an AAH message to the neighbour and restarts the timer.
          (The timer cannot expire in any other state.)</t>

          <t>In any state, receipt of an AAH causes the state machine to
          transmit an AAH ACK and enter the IDLE state.</t>

          <t>Note that for correct operation the state machine MUST remain in
          state TX-AAH, until an AAH ACK or an AAH is received, or the state
          machine is deleted. Deletion of the per neighbor state machine
          occurs when routing determines that the neighbour has gone away, or
          when the interface goes away.</t>

          <t>When routing detects a new neighbour it creates a new instance of
          the per-neighbour state machine in state Idle. The consequent
          generation of the router&rsquo;s own LSP will then cause the router
          state machine to execute the LSP receipt actions, which will if
          necessary result in the new per-neighbour state machine receiving a
          &ldquo;goto TX-AAH&rdquo; command and transitioning to TX-AAH
          state.</t>
        </section>
      </section>
    </section>

    <section title="Management Considerations">
      <t>The management requirements will depend upon the solution adopted,
      but at the very least there needs to be reporting of the current
      state.</t>
    </section>

    <section title="Scope and applicability">
      <t></t>

      <t>The initial scope of this work is in the context of link state
      IGPs.</t>
    </section>

    <section title="IANA Considerations">
      <t>There are no IANA considerations that arise from this document.</t>
    </section>

    <section title="Security Considerations">
      <t>This document does not itself introduce any security issues, but
      attention must be paid to the security implications of any proposed
      solutions to the problem.</t>
    </section>

    <section title="Acknowledgements">
      <t>The authors would like to acknowledge contributions made by Les
      Ginsberg.</t>
    </section>

    <!-- -->
  </middle>

  <!--  *****BACK MATTER ***** -->

  <back>
    <!-- References split into informative and normative -->

    <!-- There are 2 ways to insert reference entries from the citation libraries:
     1. define an ENTITY at the top, and use "ampersand character"RFC2629; here (as shown)
     2. simply use a PI "less than character"?rfc include="reference.RFC.2119.xml"?> here
        (for I-Ds: include="reference.I-D.narten-iana-considerations-rfc2434bis.xml")

     Both are cited textually in the same manner: by using xref elements.
     If you use the PI option, xml2rfc will, by default, try to find included files in the same
     directory as the including file. You can also define the XML_LIBRARY environment variable
     with a value containing a set of directories to search.  These can be either in the local
     filing system or remote ones accessed by http (http://domain/dir/... ).-->

    <references title="Normative References">
      &RFC2119;
    </references>

    <references title="Informative References">
      &IPFRR;

      &MICROLOOP;

      &OFIB;

      &PLSN;
    </references>
  </back>
</rfc>