<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
     which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!-- One method to get references from the online citation libraries.
     There has to be one entity for each item to be referenced. 
     An alternate method (rfc include) is described in the references. -->

<!ENTITY I-D.mahalingam-dutt-dcops-vxlan SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.mahalingam-dutt-dcops-vxlan.xml">
<!ENTITY I-D.sridharan-virtualization-nvgre SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.sridharan-virtualization-nvgre.xml">
<!ENTITY I-D.hasmit-otv SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.hasmit-otv.xml">
<!ENTITY I-D.eastlake-trill-rbridge-fine-labeling SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.eastlake-trill-rbridge-fine-labeling.xml">
<!ENTITY I-D.ietf-lisp SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-lisp.xml">
<!ENTITY I-D.ietf-6man-udpzero SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-6man-udpzero.xml">
<!ENTITY I-D.wkumari-dcops-l3-vmmobility SYSTEM "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.wkumari-dcops-l3-vmmobility.xml">
<!ENTITY RFC2784 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2784.xml">
<!ENTITY RFC2890 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2890.xml">
<!ENTITY RFC6325 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6325.xml">
<!ENTITY RFC2661 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2661.xml">
<!ENTITY RFC5213 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5213.xml">
<!ENTITY RFC5844 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5844.xml">
<!ENTITY RFC5845 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5845.xml">
<!ENTITY RFC6245 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6245.xml">
]>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs), 
     please see http://xml.resource.org/authoring/README.html. -->
<!-- Below are generally applicable Processing Instructions (PIs) that most I-Ds might want to use.
     (Here they are set differently than their defaults in xml2rfc v1.32) -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e., [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space 
     (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="info" docName="draft-narten-nvo3-overlay-problem-statement-01" ipr="trust200902">
  <!-- category values: std, bcp, info, exp, and historic
     ipr values: full3667, noModification3667, noDerivatives3667
     you can add the attributes updates="NNNN" and obsoletes="NNNN" 
     they will automatically be output with "(if approved)" -->

  <!-- ***** FRONT MATTER ***** -->

  <front>
    <!-- The abbreviated title is used in the page header - it is only necessary if the 
         full title is longer than 39 characters -->

    <title abbrev="Overlays for Network Virtualization">Problem Statement: Overlays for Network Virtualization</title>

    <!-- add 'role="editor"' below for the editors if appropriate -->

    <!-- Another author who claims to be an editor -->

    <author fullname="Thomas Narten" initials="T." role="editor"
            surname="Narten">
      <organization>IBM</organization>

      <address>
        <email>narten@us.ibm.com</email>
      </address>
      
    </author>

    <author fullname="Murari Sridharan" initials="M." 
            surname="Sridharan">
      <organization>Microsoft</organization>

      <address>
        <email>muraris@microsoft.com</email>
      </address>
      
    </author>

    
    <author fullname="Dinesh Dutt" initials="D." 
            surname="Dutt">
      <organization>Cisco</organization>

      <address>
        <email>ddutt@cisco.com</email>
      </address>
      
    </author>
    
    <author fullname="David Black" initials="D." 
            surname="Black">
      <organization>EMC</organization>

      <address>
        <email>david.black@emc.com</email>
      </address>
      
    </author>
    
    <author fullname="Lawrence Kreeger" initials="L." 
            surname="Kreeger">
      <organization>Cisco</organization>

      <address>
        <email>kreeger@cisco.com</email>
      </address>
      
    </author>

    <date month="October" year="2011" />

    <!-- If the month and year are both specified and are the current ones, xml2rfc will fill 
         in the current day for you. If only the current year is specified, xml2rfc will fill 
	 in the current day and month for you. If the year is not the current one, it is 
	 necessary to specify at least a month (xml2rfc assumes day="1" if not specified for the 
	 purpose of calculating the expiry date).  With drafts it is normally sufficient to 
	 specify just the year. -->

    <!-- Meta-data Declarations -->

    <area>General</area>

    <workgroup>Internet Engineering Task Force</workgroup>

    <!-- WG name at the upperleft corner of the doc,
         IETF is fine for individual submissions.  
	 If this element is not present, the default is "Network Working Group",
         which is used by the RFC Editor as a nod to the history of the IETF. -->

    <keyword>template</keyword>

    <!-- Keywords will be incorporated into HTML output
         files in a meta tag but they have no effect on text or nroff
         output. If you submit your draft to the RFC Editor, the
         keywords will be used for the search engine. -->

    <abstract>
      <t>
      This document describes issues associated with providing
      multi-tenancy in large data center networks and an overlay-based network
      virtualization approach to addressing them.  A key multi-tenancy
      requirement is traffic isolation, so that a tenant's traffic is
      not visible to any other tenant.  This isolation can be achieved
      by assigning one or more virtual networks to each tenant such that
      traffic within a virtual network is isolated from traffic in other
      virtual networks.  The primary functionality required is
      provisioning virtual networks, associating a virtual machine's NIC
      with the appropriate virtual network, and maintaining that association as
      the virtual machine is activated, migrated and/or deactivated.  Use
      of an overlay-based approach enables scalable deployment on large
      network infrastructures.
      </t>
    </abstract>
  </front>

  <middle>
    <section title="Introduction">
      <t>Server virtualization is increasingly becoming the norm in
         data centers.  With server virtualization, each physical
         server supports multiple virtual machines (VMs), each running
         its own operating system, middleware and applications.
         Virtualization is a key enabler of workload agility, i.e.,
         allowing any server to host any application and providing the
         flexibility of adding, shrinking, or moving services within
         the physical infrastructure.  Server virtualization provides
         numerous benefits, including higher utilization, increased
         data security, reduced user downtime, reduced power usage,
         etc.
	 </t>

      
      <t>
      Large scale multi-tenant data centers are taking advantage of 
      the benefits of server virtualization to provide a new kind of 
      hosting, a virtual hosted data center. Multi-tenant data centers 
      are ones in which each tenant could belong to a different company 
      (in the case of a public provider) or a different department 
      (in the case of an internal company data center). Each tenant 
      has the expectation of a level of security and privacy separating 
      their resources from those of other tenants. Each virtual data center 
      looks similar to its physical counterpart, consisting of end 
      stations connected by a network, complete with services such as 
      load balancers and firewalls. The network within each virtual 
      data center can be a pure routed network, a pure bridged network 
      or a combination of bridged and routed network. The key 
      requirement is that each such virtual network is isolated from 
      the others, whether the networks belong to the same tenant or 
      different tenants.
      </t>

      <t>
	  This document outlines the problems encountered in scaling the 
	  number of isolated networks in a data center, as well as the 
	  problems of managing the creation/deletion, membership and span 
	  of these networks, and makes the case that an overlay-based 
	  approach, where individual networks are implemented within 
	  individual virtual networks that are dynamically controlled 
	  by a standardized control plane, provides a number of advantages 
	  over current approaches.  The purpose of this document is to 
	  identify the set of problems that any solution has to address 
	  in building multi-tenant data centers. With this approach, the 
	  goal is to allow the construction of standardized, interoperable 
	  implementations to allow the construction of multi-tenant data 
	  centers.
       </t>

      <t> Section 2 describes the problem space details. Section 3 defines
      virtual networks.  Section 4 provides a general discussion of 
      overlays and standardization issues. Section 5 discusses the control 
      plane issues that require addressing for virtual networks.
      Sections 6 and 7 discuss related work and further work.
      </t>
      </section>

      <section title="Problem Details">
      <t>
        The following subsections describe aspects of multi-tenant networking
        that pose problems for large scale network infrastructure.  Different
        problem aspects may arise based on the network architecture and scale.
      </t>

	<section title="Multi-tenant Environment Scale">
	  <t>
	    Cloud computing involves on-demand elastic provisioning of
	    resources for multi-tenant environments. A common
	    example of cloud computing is the public cloud, where a
	    cloud service provider offers these elastic services to
	    multiple customers over the same infrastructure.  This 
	    elastic on-demand nature in conjunction with trusted 
	    hypervisors to control network access by VMs calls for 
	    resilient distributed network control mechanisms.
	  </t>

	</section>

        <section title="Virtual Machine Mobility Requirements">
        <t>
	  A key benefit of server virtualization is virtual machine (VM) 
	  mobility. A VM can be migrated from one server to another live, 
	  i.e., as it continues to run and without shutting down the VM and 
	  restarting it at a new location. A key requirement for live 
	  migration is that a VM retain its IP address(es) and MAC address(es)
	  in its new location (to avoid tearing down existing 
	  communication). Today, servers are assigned 
      IP addresses based on their physical location, typically based on the
      ToR (Top of Rack) switch for the server rack or the VLAN configured
      to the server.  This works well for physical servers, which cannot move,
      but it restricts the placement and movement of the more mobile VMs within
      the data center (DC). Any solution for a scalable multi-tenant DC must
      allow a VM to be placed (or moved to) anywhere within the data center,
      without being constrained by the subnet boundary concerns of the host servers.
	</t>
        </section>

	<section title="Span of Virtual Networks">

	<t> Another use case is cross pod expansion.  A pod typically 
	consists of one or more racks of servers with its associated 
	network and storage connectivity.  Tenants may start off on a 
	pod and, due to expansion, require servers/VMs on other pods, 
	especially when tenants on the other pods are not 
	fully utilizing all their resources.  This use case requires 
        that virtual networks span multiple pods in order 
        to provide connectivity to all of the tenant's servers/VMs.
	  </t>

	</section>
	
	<section title="Inadequate Forwarding Table Sizes in Switches">
	  <t>
	    Today's virtualized environments place additional demands 
	    on the forwarding tables of switches. Instead of just one 
	    link-layer address per server, the switching infrastructure 
            has to learn addresses of the individual VMs (which could 
            range in the 100s per server).  This is a requirement since 
            traffic from/to the VMs to the rest of the physical network will 
	    traverse the physical network infrastructure. This places 
	    a much larger demand on the switches' forwarding table 
	    capacity compared to non-virtualized environments, causing more 
            traffic to be flooded or dropped when the number of addresses 
            in use exceeds the forwarding table capacity.
	  </t>
	  
	</section>
	
	<section title="Decoupling Logical and Physical Configuration">
	    <t>
	      Data center operators must be able to achieve high 
	      utilization of server and network capacity. For efficient and flexible 
              allocation, operators should be able to spread a virtual network instance 
              across servers in any rack in the data center.  It should 
              also be possible to migrate compute workloads to any 
              server anywhere in the network while retaining the 
              workload's addresses. This can be achieved today by 
              stretching VLANs (e.g., by using TRILL or OTV).
	    </t>

	    <t>
	      However, in order to limit the broadcast domain of each 
	      VLAN, multi-destination frames within a VLAN should 
	      optimally flow only to those devices that have that 
	      VLAN configured. When workloads migrate, the physical 
	      network (e.g., access lists) may need to be reconfigured, 
	      which is typically time consuming and error prone.
	    </t>
	</section>
	
      <section title="Support Communication Between VMs and Non-virtualized Devices">
	    <t>
	      Within data centers, not all communication will be between VMs.
	      Network operators will continue to use non-virtualized servers for 
	      various reasons, traditional routers to provide L2VPN and L3VPN services, 
	      traditional load balancers, firewalls, intrusion detection engines and so 
	      on. Any virtual network solution should be capable of working with
              these existing systems.
	    </t>
	  </section>
	  
	  <section title="Overlay Design Characteristics">

	  <t>
	    There are layer 2 overlay protocols in existence,
	    but they were not necessarily designed to solve the
	    problem in the environment of a highly virtualized
	    data center.  Below are some of the characteristics of
	    environments that must be taken into account by the
	    overlay technology:
	    </t>
	  
	    <t>
	    <list style="numbers">
	      <t>Highly distributed systems.  The overlay should work
	      in an environment where there could be many thousands of
	      access switches (e.g. residing within the hypervisors)
	      and many more end systems (e.g. VMs) connected to them.
	      This leads to a distributed mapping system that puts a
	      low overhead on the overlay tunnel endpoints.

	      </t>
	      <t>
		Many highly distributed virtual networks with sparse
		connectivity.  Each virtual network could be highly
		dispersed inside the data center.  Also, along with the
		expectation of many virtual networks, the number of
		end systems connected to any one virtual network is expected
		to be relatively low.  Therefore, the percentage of
		access switches participating in any given virtual network
                would also be expected to be low.  For this
		reason, efficient pruning of multi-destination traffic 
                should be taken into consideration.
	      </t>

	      <t>
		Highly dynamic end systems.  End systems connected to
		virtual networks can be very dynamic, both in terms of
		creation/deletion/power-on/off and in terms of
		mobility across the access switches.
	      </t>

	      <t>
		Work with existing, widely deployed network Ethernet
		switches and IP routers without requiring wholesale
		replacement. The first hop switch that adds and removes 
		the overlay header will require new equipment and/or 
		new software.
	      </t>

	      <t>

		Network infrastructure administered by a single
		administrative domain.  This is consistent with
		operation within a data center, and not across the
		Internet.
	      </t>
	      	      
	    </list>
	  </t>
	</section>
	
	  
    </section>
      
      <section title="Defining Virtual Networks and Tenants">
      <t>
        Virtual Networks are used to isolate a tenant's traffic from other
        tenants (or even traffic within the same tenant that requires isolation).
        There are two main characteristics of virtual networks:
      </t>
        <t>
        <list style="numbers">
          <t>
            Providing network address space that is isolated from other 
            virtual networks. The same network addresses may be used in 
            different virtual networks on the same underlying network infrastructure.
          </t>
          <t>
            Limiting the scope of frames to not exit a virtual network except through
            controlled exit points or "gateways".
          </t>
        </list>
        </t>
      
        <section title="Limitations of Existing Virtual Network Models">
        <t>
          Virtual networks are not new to networking. VLANs are a 
          well known construct in the networking industry. VLAN is 
          a bridging construct which provides the semantics of virtual 
          networks mentioned above: a MAC address is unique within a 
          VLAN, but not necessarily across VLANs and broadcast traffic 
          is limited to the VLAN it originates from. In the case of IP 
          networks, routers have the concept of a Virtual Routing and 
          Forwarding (VRF).  The same router can run multiple instances 
          of routing protocols, each with their own forwarding table.  
          Each instance is referred to as a VRF, which is a mechanism 
          that provides address isolation.
          Since broadcasts are never forwarded across IP subnets, 
          limiting broadcasts is not applicable to VRFs. In the case of 
          both VLAN and VRF, the forwarding table is looked up using 
          the tuple {VLAN, MAC address} or {VRF, IP address}.
        </t>
        
        <t>
          But there are two problems with these constructs. VLANs are 
          a pure bridging construct while VRF is a pure routing 
          construct. VLANs are carried along with a frame to allow 
          each forwarding point to know what VLAN the frame belongs 
          to. VLAN today is defined as a 12 bit number, limiting the 
          total number of VLANs to 4096 (though typically, this number 
          is 4094 since 0 and 4095 are reserved). Due to the large 
          number of tenants that a cloud  provider might service, 
          the 4094 VLAN limit is often inadequate.  In  addition, 
          there is often a need for multiple VLANs per tenant, which 
          exacerbates the issue.
          </t>
          
          <t>
          There is no VRF indicator carried in frames.  The VRF is 
          derived at each hop using a combination 
          of incoming interface and some information in the frame. 
          Furthermore, the VRF model has typically assumed that a 
          separate control plane governs the population of the 
          forwarding table within that VRF. Thus, a traditional VRF 
          model assumes multiple, independent control planes and has 
          no specific tag within a frame to identify the VRF of the frame.
        </t>
        </section>
      
        <section title="Virtual Network Instance">
        <t>
          To overcome the limitations of a traditional VLAN or VRF model, 
          we define a new mechanism for virtual networks called a virtual 
          network instance. Each virtual network is assigned a virtual 
          network instance ID, shortened to VNID for convenience. A 
          virtual network instance provides the semantics of a virtual 
          network: address disambiguation and multi-destination frame 
          scoping. A virtual network can be either routed or bridged. 
          So, a VNID can be used for both bridged networks and routed 
          networks and so is unlike a VLAN or a VRF. To build large 
          multi-tenant data centers, a larger number space than the 12-bit 
          VLAN is required. 24 bits is the most common value identified 
          by multiple solutions that attempt to address this problem space 
          (or similar problem spaces). To simplify the building and 
          administration of these large data centers, we require that the 
          VNID be carried with each frame (similar to a VLAN, but 
          unlike a VRF). Finally, because of the nature of a virtual data 
          center and to allow scaling virtual networks to massive scales, 
          we don't require a separate control plane to run for each virtual 
          network. We'll identify other possible mechanisms to populate the 
          forwarding tables for virtual networks in section 5.1.
        </t>
        </section>

        <section title="Tenant">
        <t>
        Tenant is the administrative entity that is responsible for and
        manages a specific virtual network and its associated services 
        (whether virtual or physical). In a cloud environment, a tenant 
        would correspond to the customer that has defined and is using a
        particular virtual network. However, there is a one-to-many mapping
        between tenants and virtual network instances.  A single tenant may
        operate multiple individual virtual networks, each associated with a
        different service.
        </t>
        </section>
      
      </section>
      
      <section title="Network Overlays">
      
      <t>
      To address the problems of decoupling physical and logical 
      configuration and allowing VM mobility without exploding the 
      forwarding table sizes in the switches and routers, a network 
      overlay model can be used.
      </t>
	  <t>
	  The idea behind an overlay is quite straightforward. The original 
	  frame is encapsulated by the first hop network device. The 
	  encapsulation identifies the destination as the device that 
	  will perform the decapsulation before delivering the frame to 
	  the endpoint. The rest of the network forwards the frame based 
	  on the encapsulation header and can be oblivious to the payload 
	  that is carried inside. To avoid belaboring the point each time, 
	  the first hop network device can be a traditional switch or 
	  router or the virtual switch residing inside a hypervisor. 
	  Furthermore, the endpoint can be a VM or it can be a physical 
	  server. Some examples of network overlays are tunnels such as 
	  IP GRE <xref target="RFC2784"></xref>, LISP <xref target="I-D.ietf-lisp"></xref> or 
	  TRILL <xref target="RFC6325"></xref>.
	  </t>
	  <t>
	  With an overlay, the VNID can be carried within the 
	  overlay header so that every frame has its VNID explicitly 
	  identified in the frame. Since both routed and bridged semantics 
	  can be supported by a virtual data center, the original frame 
	  carried within the overlay header can be an Ethernet frame 
	  complete with MAC addresses or just the IP packet.
	  </t>
	  
      <section title="Benefits of an Overlay Approach">
	<t>
	  The use of a large (e.g., 24-bit) VNID would 
	  allow 16 million distinct virtual networks within a single data center, 
	  eliminating current VLAN size limitations.  This VNID 
	  needs to be carried in the data plane along with the packet.  
	  Adding an overlay header provides a place to carry this 
	  VNID.
	</t>

	<t>
      A key aspect of overlays is the decoupling of the "virtual" MAC and
      IP addresses used by VMs from the physical network infrastructure and
      the infrastructure IP addresses used by the data center.  If a VM
      changes location, the switches at the edge of the overlay simply
      update their mapping tables to reflect the new location of the VM
      within the data center's infrastructure space.  Because an overlay 
      network is used, a VM can now be located anywhere in the data center 
      that the overlay reaches without regard to traditional constraints 
      implied by L2 properties such as VLAN numbering, or the span of an 
      L2 broadcast domain scoped to a single pod or access switch.
	</t>

	<t>
      Multi-tenancy is supported by isolating the traffic of one virtual
      network instance from traffic of another.  Traffic from one virtual
      network instance cannot be delivered to another instance without 
      (conceptually) exiting the instance and entering the other instance via 
      an entity that has connectivity to both virtual network instances.  
      Without the existence of this entity, tenant traffic remains isolated 
      within each individual virtual network instance.  External 
      communication (from a VM within a virtual network instance to a machine 
      outside of any virtual network instance, e.g. on the Internet) is 
      handled by having an ingress switch forward traffic to an 
      external router, where an egress switch decapsulates a tunneled packet 
      and delivers it to the router for normal processing.  This router is 
      external to the overlay, and behaves much like existing external 
      facing routers in data centers today.
	</t>

	<t>
	  Overlays are designed to allow a set of VMs to be placed within a 
          single virtual network instance, whether that virtual network provides a 
          bridged network or a routed network.
	</t>
    </section>

	<section title="Standardization Issues for Overlay Networks">
	
	  <section title="Overlay Header Format">
	  <t>
	    Different overlay header formats are possible as are 
	    different possible encodings of the VNID. Existing 
	    overlay headers may be extended or new ones defined. This 
	    document does not address the exact header format or 
	    VNID encoding except to state that any solution MUST:
	  </t>
	  
	  <t>
	  <list style="numbers">
	  <t>
	    Carry the VNID in each frame
	  </t>
	  <t>
	    Allow the payload to be either a complete Ethernet frame 
	    or only an IP packet
	  </t>
	  </list>
	  </t>
	  </section>
	  
	  <section title="Fragmentation">
	  <t>
	    Whenever tunneling is used, one faces the potential problem that
	    the packet plus the encapsulation overhead will exceed the MTU 
	    of the path to the egress router.  If the outer encapsulation 
	    is IP, fragmentation could be left to the IP layer, or it could 
	    be done at the overlay level in a more optimized fashion that is 
	    independent of the overlay encapsulation header, or it could be 
	    left out altogether, if it is believed that data center networks 
	    can be engineered to prevent MTU issues from arising.
	  </t>
	  <t>
        Related to fragmentation is the question of how best to handle Path
        MTU issues, should they occur.  Ideally, the original source of any
        packet (i.e., the sending VM) would be notified of the optimal MTU to
        use.  Path MTU problems occurring within an overlay network would 
        result in ICMP MTU exceeded messages being sent back to the ingress
        tunnel switch at the entry point of the overlay.  If the switch is
        embedded within a hypervisor, the hypervisor could notify the VM of a
        more appropriate MTU to use.  It may be appropriate to specify a set
        of best practices for implementers related to the handling of Path
        MTU issues.
      </t>
	  </section>

        <section title="Checksums and FCS">
	<t>
	When tunneling packets, both the inner and outer headers could have
        their own checksum, duplicating effort and impacting
        performance. Therefore, we strongly recommend that any solution carry 
        only one checksum or frame FCS.
	</t>
	<t>
        When the inner packet is TCP or UDP, it already includes its own 
        checksum, and adding a second outer checksum (using the same 1's 
        complement algorithm) provides little value. Similarly, if the inner
        packet is an Ethernet frame, the frame FCS protects the original 
        frame and a new frame FCS over both the original frame and the 
        overlay header protects the new encapsulated frame. 
        </t>
        <t>
        In IPv4, UDP checksums can be disabled on a per-packet basis simply
        by setting the checksum field to zero. IPv6, however, specifies that 
        UDP checksums must always be included. But even for IPv6, the LISP 
        protocol <xref target="I-D.ietf-lisp"></xref> already allows a 
        zero checksum field. The 6man working group is also currently
        considering relaxing the IPv6 UDP checksum requirement
        <xref target="I-D.ietf-6man-udpzero"></xref>.
	</t>
	<t>
        For Ethernet frames, L2 overlays such as TRILL already mandate 
        only a single frame FCS. 
	</t>
	</section>

	<section title="Middlebox Traversal">
        <t>
	One issue to consider is whether the overlay will need to run
        over networks that include middleboxes such as NAT. Middleboxes may
        have difficulty properly supporting multicast or other aspects of
        an overlay header.  Inside a data center, it may well be the case
        that middlebox traversal is a non-issue. But if overlays are
        extended across the broader Internet, the presence of middleboxes
        may be of concern.
        </t>
        </section>

	  <section title="OAM">
	    <t>
	      Successful deployment of an overlay approach will likely require 
	      appropriate Operations, Administration and Maintenance (OAM)
	      facilities.
	    </t>
	  </section>
	  
	  </section>
	  
	</section>


	<section title="Control Plane">
	<t>
      The control plane needs to address the following pieces, at least:
	</t>

      <t>
	  <list style="numbers">
	  <t>
	    A mechanism to populate the forwarding table of a virtual 
	    network instance.
	  </t>
	  <t>
	    A mechanism to handle multi-destination frames within a 
	    virtual network instance.
	  </t>
	  <t>
	    A mechanism to allow an endpoint to inform the access 
	    switch which virtual network instance it wishes to join 
            on a virtual network interface.
	  </t>
	  <t>
	    A mechanism to allow an endpoint to inform the access switch
	    about its leaving the network so that the access switch can 
	    clean up state.
	  </t>
	  </list>
	  </t>
	  
	  <section title="Populating the Forwarding Table of a Virtual Network Instance">
	    <t>
	      When an access switch has to forward a frame from one endpoint to 
	      another, across the network, it has to consult some form of a forwarding 
	      table. When we use network overlays, the problem boils down to deriving the 
	      mapping between the inner and outer addresses i.e. deriving the destination 
	      address in the overlay header based on the destination address sent by the 
	      endpoint. Two well known mechanisms for populating the forwarding table (or 
	      deriving the mapping table) of a switch are (i) via a routing control 
	      protocol and (ii) learning from the data plane as Ethernet bridges do. 
	      Another mechanism is through a centralized mapping database. Any solution must 
              avoid problems associated with scaling a virtual network instance across 
              a large data center. 
	    </t>
	  </section>
	  
	  <section title="Handling Multi-destination Frames">
	    <t>
	      Another aspect of address mapping concerns the handling of 
	      multi-destination frames, i.e. broadcast and multicast frames, or the 
	      delivery of unicast packets when no mapping exists. Associating an 
	      infrastructure multicast address is one possible way of connecting 
              together all the machines belonging to the same VNID. However, existing 
              multicast implementations do not scale to efficiently handle hundreds 
              of thousands of multicast groups, as would be required if one multicast 
              group were assigned to each VNID.
	   </t>
      </section>
      
      <section title="Associating a VNID With An Endpoint">
        <t>
          When an endpoint, such as a VM or physical server, connects to the infrastructure, 
          we must define a mechanism to allow the endpoint to identify to the access 
          switch the network instance that it wishes to join. Typically, it is a 
          virtual NIC (the one connected to the VM) coming up that triggers this 
          association. The access switch can then determine the VNID to be 
          associated with this virtual NIC. A standard protocol that all types of 
          overlay encapsulation points can use to identify the VNID associated 
          with an endpoint will be beneficial for supporting multi-vendor 
          implementations.  This protocol could also be used to distribute any per 
          virtual network information (e.g. a multicast group address). This signaling 
          can provide the stimulus to trigger the overlay termination points to 
          perform any actions needed within the infrastructure network (e.g. use IGMP to 
          join a multicast group).
        </t>
      </section>
      
      <section title="Disassociating a VNID on Termination or Move">
        <t>
          To enable cleaning up state in the access switch, we must define a mechanism 
          to allow an endpoint to signal its disconnection from the network.
        </t>
      </section>

	 </section>

    <section title="Related Work">

	<section title="ARMD">
	  <t>
	    ARMD is chartered to look at data center scaling issues
	    with a focus on address resolution. ARMD is currently
	    chartered to develop a problem statement and is not
	    currently developing solutions. While an overlay-based
	    approach may address some of the "pain points" that have
	    been raised in ARMD (e.g., better support for
	    multi-tenancy), an overlay approach may also push some of
	    the L2 scaling concerns (e.g., excessive flooding) to the
	    IP level (flooding via IP multicast). Analysis will be
	    needed to understand the scaling trade-offs of an
	    overlay-based approach compared with existing approaches. On the
	    other hand, existing IP-based approaches such as proxy ARP
	    may help mitigate some concerns.
	  </t>
	</section>
	<section title="TRILL">
	  <t>
	    TRILL is an L2 based approach aimed at improving
	    deficiencies and limitations with current Ethernet
	    networks. Approaches to extend TRILL to support more 
	    than 4094 VLANs are currently under investigation 
	    <xref target="I-D.eastlake-trill-rbridge-fine-labeling"></xref>.
	  </t>  
	 </section>
	<section title="L2VPNs">
	  <t>
	    The IETF has specified a number of approaches for
	    connecting L2 domains together as part of the L2VPN
	    Working Group. That group, however, has historically been focused on
	    Provider-provisioned L2 VPNs, where the service provider
	    participates in management and provisioning of the VPN. In
	    addition, much of the target environment for such
	    deployments involves carrying L2 traffic over
	    WANs. Overlay approaches are intended to be used within data
	    centers where the overlay network is managed by the
	    data center operator, rather than by an outside
	    party. While overlays can run across the Internet as well,
	    they will extend well into the data center itself (e.g., up
	    to and including hypervisors) and include large numbers of
	    machines within the data center itself.
	  </t>

	  <t>
	    Other L2VPN approaches, such as
	    L2TP <xref target="RFC2661"></xref> require significant
	    tunnel state at the encapsulating and decapsulating end
	    points. Overlays require less tunnel state than other
	    approaches, which is important to allow overlays to scale
	    to hundreds of thousands of end points. It is assumed that
	    smaller switches (i.e., virtual switches in hypervisors or
	    the physical switches to which VMs connect) will be part
	    of the overlay network and be responsible for
	    encapsulating and decapsulating packets.
	  </t>
	</section>

	<section title="Proxy Mobile IP">
	  <t>
	    Proxy Mobile
	    IP <xref target="RFC5213"></xref> <xref target="RFC5844"></xref>
	    makes use of the GRE Key
	    Field <xref target="RFC5845"></xref> <xref target="RFC6245"></xref>,
	    but not in a way that supports multi-tenancy.
	  </t>
	</section>
	
	<section title="LISP">
	  <t>
	    LISP <xref target="I-D.ietf-lisp"></xref>
	    essentially provides an IP over IP overlay where the internal 
	    addresses are end station Identifiers and the outer IP addresses 
	    represent the location of the end station within the core IP 
	    network topology. The LISP overlay header uses a 24 bit Instance 
	    ID used to support overlapping inner IP addresses.
	  </t>

	</section>

        <section title="Individual Submissions">
	<t>
	Many individual submissions also look to address some or all of the 
	issues addressed in this draft. Examples of such drafts are VXLAN
	<xref target="I-D.mahalingam-dutt-dcops-vxlan"></xref>, NVGRE
	<xref target="I-D.sridharan-virtualization-nvgre"></xref> and Virtual
        Machine Mobility in L3 networks <xref target="I-D.wkumari-dcops-l3-vmmobility"></xref>. 
	</t>
	</section>
	

      </section>
	
      <section title="Further Work">
	<t>
          It is believed that overlay-based approaches may be able to
	  reduce the overall amount of flooding and other multicast
	  and broadcast related traffic (e.g., ARP and ND) currently
	  experienced within current data centers with a large flat L2
	  network. Further analysis is needed to characterize expected
	  improvements.
	</t>
      </section>

      <section title="Summary">
	<t>
	  This document has argued that network virtualization using
	  L3 overlays addresses a number of issues being faced as data
	  centers scale in size. In addition, careful consideration of
	  a number of issues would lead to the development of
	  interoperable implementations of virtualization overlays.
	</t>

      </section>

    <section anchor="Acknowledgments" title="Acknowledgments">
      <t>Helpful comments and improvements to this document have come
      from Ariel Hendel, Vinit Jain, and Benson Schliesser.</t>

    </section>

    <!-- Possibly a 'Contributors' section ... -->

    <section anchor="IANA" title="IANA Considerations">
      <t>This memo includes no request to IANA.</t>

    </section>

    <section anchor="Security" title="Security Considerations">
      <t>TBD</t>
    </section>

  </middle>

  <!--  *****BACK MATTER ***** -->

  <back>
    <!-- References split into informative and normative -->

    <!-- There are 2 ways to insert reference entries from the citation libraries:
     1. define an ENTITY at the top, and use "ampersand character"RFC2629; here (as shown)
     2. simply use a PI "less than character"?rfc include="reference.RFC.2119.xml"?> here
        (for I-Ds: include="reference.I-D.narten-iana-considerations-rfc2434bis.xml")

     Both are cited textually in the same manner: by using xref elements.
     If you use the PI option, xml2rfc will, by default, try to find included files in the same
     directory as the including file. You can also define the XML_LIBRARY environment variable
     with a value containing a set of directories to search.  These can be either in the local
     filing system or remote ones accessed by http (http://domain/dir/... ).-->

    <references title="Informative References">


      <!-- Here we use entities that we defined at the beginning. -->

      &I-D.mahalingam-dutt-dcops-vxlan;
      &I-D.sridharan-virtualization-nvgre;
      &I-D.wkumari-dcops-l3-vmmobility;
      &I-D.eastlake-trill-rbridge-fine-labeling;
      &I-D.ietf-lisp;
      &I-D.ietf-6man-udpzero;
      &I-D.hasmit-otv;
      &RFC2661;
      &RFC2784;
      &RFC2890;
      &RFC5213;
      &RFC5844;
      &RFC5845;
      &RFC6245;
      &RFC6325;

    </references>

    <!-- Change Log
      -->
  </back>
</rfc>
