<?xml version="1.0" encoding="us-ascii"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
     which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!-- One method to get references from the online citation libraries.
     There has to be one entity for each item to be referenced.
     An alternate method (rfc include) is described in the references. -->

<!ENTITY RFC2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC2629 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2629.xml">
<!ENTITY RFC3552 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3552.xml">
<!ENTITY I-D.narten-iana-considerations-rfc2434bis SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.narten-iana-considerations-rfc2434bis.xml">
<!ENTITY RFC4821 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4821.xml">
]>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs),
     please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds might want to use.
     (Here they are set differently than their defaults in xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e, [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space
     (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="info" docName="draft-v6ops-pmtud-ecmp-problem-00" ipr="trust200902">
  <!-- category values: std, bcp, info, exp, and historic
     ipr values: full3667, noModification3667, noDerivatives3667
     you can add the attributes updates="NNNN" and obsoletes="NNNN"
     they will automatically be output with "(if approved)" -->

  <!-- ***** FRONT MATTER ***** -->

  <front>
    <!-- The abbreviated title is used in the page header - it is only necessary if the
         full title is longer than 39 characters -->

    <title abbrev="misses with ICMPv6 PTB">Close encounters of the ICMP type 2 kind (near misses with ICMPv6 PTB)</title>

    <!-- add 'role="editor"' below for the editors if appropriate -->

    <!-- Another author who claims to be an editor -->

    <author fullname="Matt Byerly" initials="M." surname="Byerly">
      <organization>Fastly</organization>

      <address>
        <postal>
          <street></street>

          <!-- Reorder these if your country does things differently -->

          <city>Kapolei</city>

          <region>HI</region>

          <code></code>

          <country>US</country>
        </postal>

        <phone></phone>

        <email>mbyerly@zynga.com</email>

        <!-- uri and facsimile elements may also be added -->
      </address>
    </author>

    <author fullname="Matt Hite" initials="M." surname="Hite">
      <organization>Evernote</organization>

      <address>
        <postal>
          <street></street>

          <!-- Reorder these if your country does things differently -->

          <city>Redwood City</city>

          <region>CA</region>

          <code></code>

          <country>US</country>
        </postal>

        <phone></phone>

        <email>mhite@hotmail.com</email>

   </address>
  </author>


    <author fullname="Joel Jaeggli" initials="J." surname="Jaeggli">
      <organization>Fastly</organization>

      <address>
        <postal>
          <street></street>

          <!-- Reorder these if your country does things differently -->

          <city>Mountain View</city>

          <region>CA</region>

          <code></code>

          <country>US</country>
        </postal>

        <phone></phone>

        <email>joelja@gmail.com</email>

        <!-- uri and facsimile elements may also be added -->
      </address>
    </author>

    <date year="2014" />

    <!-- If the month and year are both specified and are the current ones, xml2rfc will fill
         in the current day for you. If only the current year is specified, xml2rfc will fill
	 in the current day and month for you. If the year is not the current one, it is
	 necessary to specify at least a month (xml2rfc assumes day="1" if not specified for the
	 purpose of calculating the expiry date).  With drafts it is normally sufficient to
	 specify just the year. -->

    <!-- Meta-data Declarations -->

    <area>Operations</area>

    <workgroup>v6ops</workgroup>

    <!-- WG name at the upperleft corner of the doc,
         IETF is fine for individual submissions.
	 If this element is not present, the default is "Network Working Group",
         which is used by the RFC Editor as a nod to the history of the IETF. -->

    <keyword>ipv6 icmp6 icmpv6 type 2 ptb</keyword>

    <!-- Keywords will be incorporated into HTML output
         files in a meta tag but they have no effect on text or nroff
         output. If you submit your draft to the RFC Editor, the
         keywords will be used for the search engine. -->

    <abstract>
      <t>This document calls attention to the problem of delivering ICMPv6 type 2 "Packet Too Big" (PTB) messages to intended destinations in ECMP load balanced, anycast network architectures. It discusses operational mitigations that can address this class of failure.</t>
    </abstract>
  </front>

  <middle>
    <section title="Introduction">
      <t>Operators of popular Internet services face unique challenges associated with scaling their infrastructure. One approach is to utilize equal-cost multi-path (ECMP) routing to perform stateless distribution of incoming TCP or UDP sessions to multiple servers or middle boxes such as load balancers. Distribution of traffic in this manner presents a problem when dealing with ICMP signaling. Specifically, an ICMP error is not guaranteed to hash via ECMP to the same destination as its corresponding TCP or UDP session. A case where this is particularly problematic operationally is path MTU discovery (PMTUD).</t>

<!--
      <section title="Requirements Language">
        <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
        "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
        document are to be interpreted as described in <xref
        target="RFC2119">RFC 2119</xref>.</t>
      </section> -->
    </section>

    <section title="Problem">
      <t>
        A common application for stateless load balancing of TCP or UDP flows is to perform an initial subdivision of flows in front of a stateful load balancer tier or multiple servers so that the workload becomes divided into manageable fractions of the total number of flows. The flow division is performed using ECMP forwarding and a stateless but sticky algorithm for hashing across the available paths. This next-hop selection for the purposes of flow distribution is a constrained form of anycast where all anycast destinations are equidistant topologically from the upstream router responsible for making the last next-hop forwarding decision before the flow arrives on the destination device. In this approach, the hash is performed across some set of available protocol headers. Typically, these headers may include (IPv6) Flow-Label, IP-source, IP-destination, protocol, source-port, destination-port and potentially others such as ingress interface.
      </t>

      <t>
        A problem common to this approach of distribution through hashing is its impact on path MTU discovery. An ICMPv6 type 2 PTB message generated on an intermediate device for a packet sent from an ECMP load-balanced server to a client will have the load-balanced anycast address as the destination and will be statelessly load balanced to one of the anycast servers. While the ICMPv6 PTB message contains as much of the packet that could not be forwarded as possible, the payload headers do not factor into the forwarding decision and are ignored. Because the PTB message is not identifiable as part of the original flow by its packet header, the ICMPv6 ECMP hash is unlikely to select the same next-hop as the hash computed for the TCP or UDP packets of that flow.
      </t>

      <t>
      An example packet flow and topology follow.
      </t>

<figure align="center" anchor="nexthop">
        <preamble></preamble>

        <artwork align="left"><![CDATA[

ptb -> router ecmp -> nexthop L4/L7 load balancer -> destination

  router --> load balancer 1 --->
       \\--> load balancer 2 ---> load-balanced service
        \--> load balancer N --->

            ]]></artwork>

        <postamble> </postamble>
      </figure>

<t>
The router ECMP decision is used because it is part of the forwarding architecture, can be performed at line rate, and does not depend on shared state or coordination across a distributed forwarding architecture which may include multiple linecards or routers. The ECMP routing decision is deterministic with respect to packets having the same computed hash.
</t>

<t>
  The typical case where ICMPv6 PTB messages are received at the load balancer is a case where the path MTU from the client to the load balancer is limited by a tunnel of which the client itself is not aware. In the common case of a TCP flow where TLS is employed, the first packet that is likely to exceed a tunnel MTU lower than that specified by the MSS on the client and the load balancer/server is the TLS ServerHello and certificate.
</t>

<t>
  Direct experience says that the frequency of PTB messages is small compared to total flows. One possible conclusion is that tunneled IPv6 deployments that cannot carry 1500-byte packets are relatively rare. Techniques employed by clients, such as Happy Eyeballs, may actually contribute some amelioration to the IPv6 client experience by preferring IPv4 in cases that might be identified as slow or failed. Still, the expectation of operators is that PMTUD should work and that unnecessary breakage of client traffic should be avoided.
</t>

<t>
  A final observation regarding server tuning is that it is typically not possible, even though it may be desirable, to independently set the TCP MSS for different address families on end-systems.
</t>

  <t>The problem as described does also impact IPv4; however, the ability to fragment on wire at tunnel ingress points and the relative rarity of sub-1500 byte MTUs that are not coupled to changes in client behavior (for example, endpoint VPN clients set the tunnel interface MTU accordingly for performance reasons) make the problem sufficiently rare that some deployments simply choose to ignore it.
  </t>

</section>

  <section title="Mitigation">
    <t>
      Mitigation of the potential for PTB messages to be mis-delivered involves ensuring that an ICMPv6 error message is distributed to the same anycast server responsible for the flow for which the error is generated. Ideally, mitigation could be done by the mechanism hosts use to identify the flow, by looking into the payload of the ICMPv6 message (to determine which TCP flow it was associated with) before making a forwarding decision. Because the encapsulated IP header occurs at a fixed offset in the ICMPv6 message, it is not outside the realm of possibility that routers with sufficient header processing capability could parse that far into the payload. Employing a mediation device that handles the parsing and distribution of PTB messages after policy routing or on each load-balancer/server is a possibility.
    </t>

    <t>
      Another mitigation approach is predicated upon distributing the PTB message to all anycast servers under the assumption that the one for which the message was intended will be able to match it to the flow and update the route cache with the new MTU; devices not able to match the flow will discard these packets. Such distribution has potentially significant implications for resource consumption and the potential for self-inflicted denial-of-service if not carefully employed. Fortunately, in real-world deployments we have observed that the number of flows for which this problem occurs is relatively small (for example, 10 or fewer pps on 1Gb/s or more worth of HTTPS traffic), and sensible ingress rate limiters which will discard excessive message volume can be applied to protect even very large anycast server tiers, with the potential for fallout only under circumstances of deliberate duress.
    </t>
<section title="Alternatives">
<t>
  As an alternative, it may be appropriate to lower the TCP MSS to 1220 in order to accommodate a 1280 byte MTU. We consider this undesirable, as hosts may not be able to set the TCP MSS independently by address family, thereby impacting IPv4; alternatively, it relies on a middle-box to clamp the MSS independently from the end-systems.
</t>
</section>
    <section title="Implementation">
    <t>
      <list style="numbers">
        <t>
          Filter-based-forwarding matches next-header ICMPv6 type-2 and matches a next-hop on a particular subnet directly attached to both border routers. The filter is policed to reasonable limits (we chose 1000pps).
        </t>

        <t>
          The filter is applied on the input side of all external interfaces.
        </t>

        <t>
          A proxy located at the next-hop forwards ICMPv6 type-2 packets received at the next-hop to an Ethernet broadcast address (example ff:ff:ff:ff:ff:ff) on all specified subnets. This was necessitated by router inability (in IPv6) to forward the same packet to multiple unicast next-hops.
        </t>

        <t>
          Anycast servers receive the PTB error and process the packet as needed.
        </t>
      </list>
    </t>
    <t>
      A simple Python scapy script that can perform the ICMPv6 proxy reflection is included.
    </t>

<figure>
<artwork><![CDATA[
      #!/usr/bin/python
      # ICMPv6 "Packet Too Big" (PTB) reflector: re-sends each sniffed
      # PTB frame as an Ethernet broadcast on every IFACE_OUT interface.

      from scapy.all import *

      # Interfaces the rewritten frames are transmitted on.
      IFACE_OUT = ["p2p1", "p2p2"]

      def icmp6_callback(pkt):
          """Re-broadcast one sniffed ICMPv6 PTB frame on IFACE_OUT.

          Frames that are not IPv6 PTB, or that already carry the
          broadcast destination MAC, are ignored.
          """
          # The destination check presumably stops the proxy from
          # re-processing frames it has itself broadcast -- confirm.
          if pkt.haslayer(IPv6) and (ICMPv6PacketTooBig in pkt) \
          and pkt[Ether].dst != 'ff:ff:ff:ff:ff:ff':
              # Drop the original source MAC; scapy is expected to fill
              # in the sending interface's address -- confirm.
              del(pkt[Ether].src)
              pkt[Ether].dst = 'ff:ff:ff:ff:ff:ff'
              pkt.show()  # print the rewritten frame
              for iface in IFACE_OUT:
                  sendp(pkt, iface=iface)

      def main():
          """Sniff ICMPv6 type 2 packets for icmp6_callback."""
          # ip6[40+0] is the byte right after the 40-byte fixed IPv6
          # header, i.e. the ICMPv6 type when no extension headers are
          # present; 2 is Packet Too Big.
          sniff(prn=icmp6_callback, filter="icmp6 \
          and (ip6[40+0] == 2)", store=0)

      if __name__ == '__main__':
          main()
 ]]></artwork>
</figure>

    <t>
      This example script listens on all interfaces for IPv6 PTB errors being forwarded using filter-based-forwarding. It removes the existing Ethernet source and rewrites the Ethernet destination to the Ethernet broadcast address. It then sends the resulting frame out the p2p1 and p2p2 interfaces where our anycast servers reside.
    </t>
    </section>

  </section>

  <section title="Improvements">
    <t>There are several ways that improvements could be made to the situation with respect to ECMP load balancing of ICMPv6 PTB.
    </t>

    <t>
      <list style="numbers">
        <t>Routers with sufficient capacity within the lookup process could parse all the way through the L3 or L4 header in the ICMPv6 payload beginning at bit offset 32 of the ICMP header. By reordering the elements of the hash to match the inward direction of the flow, the PTB error could be directed to the same next-hop as the incoming packets in the flow.
        </t>

        <t>The FIB could be programmed with a multicast distribution tree that included all of the necessary next-hops.
        </t>

        <t>Ubiquitous implementation of <xref target="RFC4821">RFC 4821</xref> Packetization Layer Path MTU Discovery would probably go a long way towards reducing dependence on ICMPv6 PTB.</t>
      </list>
      </t>

</section>


    <section anchor="Acknowledgements" title="Acknowledgements">
      <t>The authors would like to thank Mark Andrews, Brian Carpenter, Nick Hilliard, and Ray Hunter for review.</t>
    </section>

    <!-- Possibly a 'Contributors' section ... -->

    <section anchor="IANA" title="IANA Considerations">
      <t>This memo includes no request to IANA.</t>
    </section>

    <section anchor="Security" title="Security Considerations">
      <t>The employed mitigation has the potential to greatly amplify the impact of a deliberately malicious sending of ICMPv6 PTB messages. Sensible ingress rate limiting can reduce the potential for impact; however, legitimate traffic may be lost once the rate limit is reached.</t>

      <t>The proxy replication results in devices not associated with the flow that generated the PTB being recipients of an ICMPv6 message which contains a fragment of a packet. This could arguably result in information disclosure. Recipient machines should be in a common administrative domain.</t>
    </section>

  </middle>

  <!--  *****BACK MATTER ***** -->

  <back>
    <!-- References split into informative and normative -->

    <!-- There are 2 ways to insert reference entries from the citation libraries:
     1. define an ENTITY at the top, and use "ampersand character"RFC2629; here (as shown)
     2. simply use a PI "less than character"?rfc include="reference.RFC.2119.xml"?> here
        (for I-Ds: include="reference.I-D.narten-iana-considerations-rfc2434bis.xml")

     Both are cited textually in the same manner: by using xref elements.
     If you use the PI option, xml2rfc will, by default, try to find included files in the same
     directory as the including file. You can also define the XML_LIBRARY environment variable
     with a value containing a set of directories to search.  These can be either in the local
     filing system or remote ones accessed by http (http://domain/dir/... ).-->

<references title="Informative References">


&RFC4821;

</references>

  </back>
</rfc>
