<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
     which is available here: http://xml.resource.org. -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd">
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<!-- used by XSLT processors -->
<!-- For a complete list and description of processing instructions (PIs), 
     please see http://xml.resource.org/authoring/README.html. -->
<?rfc strict="yes" ?>
<!-- give errors regarding ID-nits and DTD validation -->
<!-- control the table of contents (ToC) -->
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="3"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e, [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space 
     (using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="info" docName="draft-song-opsawg-ntf-00" ipr="trust200902">
  <front>
    <title abbrev="Network Telemetry Framework">Network Telemetry Framework</title>

    <author fullname="Haoyu Song" initials="H." role="editor" surname="Song">
      <organization>Huawei</organization>

      <address>
        <postal>
          <street>2330 Central Expressway</street>

          <city>Santa Clara</city>

          <country>USA</country>
        </postal>

        <email>haoyu.song@huawei.com</email>
      </address>
    </author>

    <author fullname="Tianran Zhou" initials="T." surname="Zhou">
      <organization>Huawei</organization>

      <address>
        <postal>
          <street>156 Beiqing Road</street>

          <city>Beijing, 100095</city>

          <country>P.R. China</country>
        </postal>

        <email>zhoutianran@huawei.com</email>
      </address>
    </author>

    <author fullname="Zhenbin Li" initials="ZB." surname="Li">
      <organization>Huawei</organization>

      <address>
        <postal>
          <street>156 Beiqing Road</street>

          <city>Beijing, 100095</city>

          <country>P.R. China</country>
        </postal>

        <email>lizhenbin@huawei.com</email>
      </address>
    </author>

    <author fullname="Giuseppe Fioccola" initials="G." surname="Fioccola">
      <organization>Telecom Italia</organization>

      <address>
        <postal>
          <street>Via Reiss Romoli, 274</street>

          <city>Torino</city>
		  
		  <code>10148</code>

          <country>Italy</country>
        </postal>

        <email>giuseppe.fioccola@telecomitalia.it</email>
      </address>
    </author>

    <author fullname="Zhenqiang Li" initials="ZQ." surname="Li">
      <organization>China Mobile</organization>

      <address>
        <postal>
          <street>No. 32 Xuanwumenxi Ave., Xicheng District</street>

          <city>Beijing, 100032</city>

          <country>P.R. China</country>
        </postal>

        <email>lizhenqiang@chinamobile.com</email>
      </address>
    </author>

    <author fullname="Pedro Martinez-Julia" initials="P." surname="Martinez-Julia">
      <organization>NICT</organization>
      <address>
        <postal>
          <street>4-2-1, Nukui-Kitamachi</street>
          <city>Koganei</city>
          <region>Tokyo</region>
          <code>184-8795</code>
          <country>Japan</country>
        </postal>
        <phone>+81 42 327 7293</phone>
        <email>pedro@nict.go.jp</email>
      </address>
    </author>

    <author fullname="Laurent Ciavaglia" initials="L." surname="Ciavaglia">
      <organization>Nokia</organization>

      <address>
        <postal>
          <street></street>

          <city>Villarceaux</city>
		  
		  <code>91460</code>

          <country>France</country>
        </postal>

        <email>laurent.ciavaglia@nokia.com</email>
      </address>
    </author>


    <author fullname="Aijun Wang" initials="A." surname="Wang">
      <organization>China Telecom</organization>
    
      <address>
        <postal>

          <street>Beiqijia Town, Changping District</street>
          <city>Beijing, 102209</city>

          <country>P.R. China</country>
        </postal>

        <email>wangaj.bri@chinatelecom.cn</email>
      </address>
    
    </author>

    <date day="8" month="August" year="2018"/>

    <area>Operation and Management Area</area>
    <workgroup>OPSAWG</workgroup>

    <!---->

    <keyword>Telemetry, OAM</keyword>

    <abstract>
	    <t>This document suggests the necessity of an architectural framework for network telemetry 
	       in order to meet the current and future network operation requirements.
	       The defining characteristics of network telemetry show a clear distinction from the conventional network OAM concept; 
	       hence the network telemetry demands new techniques and protocols.
	       This document clarifies the terminologies and classifies the categories and components of a network telemetry framework. 
	       The requirements, challenges, existing solutions, and future directions are discussed for each category.
	       The network telemetry framework and the taxonomy help to set a common ground for the collection of related works 
	       and put future technique and standard developments into perspective.</t>
    </abstract>

    <note title="Requirements Language">
      <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
      "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and
      "OPTIONAL" in this document are to be interpreted as described in
      BCP 14 <xref target="RFC2119"></xref> <xref target="RFC8174"></xref> when, and only when, they appear in all
      capitals, as shown here.</t>
    </note>
  </front>

  <middle>
      <section title="Motivation">
   
      <t>The advance of AI/ML technologies gives networks an unprecedented opportunity to realize network autonomy with closed control loops.  
      An intent-driven autonomous network is the logical next step for network
      evolution following SDN, aiming to reduce (or even eliminate) human labor, make the most efficient use
      of network resources, and provide better services more aligned with customer requirements.
      Although we still have a long way to reach the ultimate goal, the journey has started nevertheless.</t>

      <t>The storage and computing technologies are already mature enough to be able to retain and process
      a huge amount of data and make real-time inference. 
      Tools based on machine learning technologies and big data analytics are powerful in
      detecting and reacting to network faults, anomalies, and policy violations. 
      In turn, the network policy updates for planning, intrusion prevention, optimization, and self-healing can be applied. 
      Some tools can even predict future events based on historical data.</t>  
	      
      <t>However, networks fail to keep pace with such data needs.
	 The current network architecture, protocol suite, and system design are not ready yet to provide enough quality data.
        In the remainder of this section, we first identify a few key network operation use cases
	that network operators need the most. These use cases are also the essential functions of the future autonomous networks. 
	Next, we show why the current network OAM techniques and protocols are not
	sufficient to meet the requirements of these use cases. The discussion underlines the need for a new breed of techniques and protocols,
	which we put under an umbrella term - network telemetry.
      </t>    
      
 
      <section title="Use Cases"> 

        <t>All these use cases involve the
        data extracted from the network data plane and sometimes from the
	network control plane and management plane. </t>

        <t><list style="hanging">
            <t hangText="Intent and Policy Compliance:">Network policies are the rules
            that constrain the services for network access, provide differentiation within 
	    a service, or enforce specific treatment on the traffic. For example, a
            service function chain is a policy that requires the selected
	    flows to pass through a set of network functions in order. An intent is a high-level abstract policy  
	    which requires a complex translation and mapping process before being applied on networks.
	    While a policy is enforced, the compliance needs to be verified and monitored
            continuously.</t>

            <t hangText="SLA Compliance:">A Service-Level Agreement (SLA) defines
            the level of service a user expects from a network operator, which
            includes the metrics for the service measurement and remedy/penalty
            procedures when the service level misses the agreement. Users need
            to check if they get the service as promised and network operators
            need to evaluate how they can deliver the services that can meet
            the SLA.</t>

            <t hangText="Root Cause Analysis:">Network failure often involves
            a sequence of chained events and the source of the failure is not
            straightforward to identify, especially when the failure is
            sporadic. While machine learning or other data analytics
            technologies can be used for root cause analysis, it is up to the
            network to provide all the relevant data for analysis.</t>

            <t hangText="Load Balancing, Traffic Engineering, and Network Planning:">Network operators are
            motivated to optimize their network utilization for better ROI or
	    lower CAPEX, as well as differentiation across services and/or users of a given service.  
	    The first step is to know the real-time network
            conditions before applying policies to steer the user traffic or
            adjust the load balancing algorithm.  In some cases network
            micro-bursts need to be detected in a very short time-frame so
	    that fine grained traffic control can be applied to avoid possible network congestion.
            The long term network capacity planning and topology augmentation also rely on the accumulated 
            data of the network operation.</t>

            <t hangText="Event Tracking and Prediction:">Network visibility is critical for a healthy network operation.
            Numerous network events are of interest to network operators. For example,
	    network operators always want to learn where and why packets are dropped for an application flow.  
	    They also want to be warned by some early signs that some component is going to fail so 
	    the proper fix or replacement can be made in time. </t>
          </list></t>

        </section>

	<section title="Challenges">

	<t>The conventional OAM techniques, as described in <xref target="RFC7276"></xref>,
        are not sufficient to support the above use cases for the following reasons:</t>

        <t><list style="symbols">


            <t>Most use cases need to continuously monitor the network and
            dynamically refine the data collection in real-time and interactively. 
	    The poll-based low-frequency data collection is ill-suited for these
            applications. Streaming data directly pushed from the data source
            is preferred.</t>

            <t>Various data is needed from any place ranging from the packet
            processing engine to the QoS traffic manager. Traditional data
            plane devices cannot provide the necessary probes. An open and
            programmable data plane is therefore needed.</t>

            <t>Many application scenarios need to correlate data from multiple
            sources (e.g., from distributed nodes or from different network
            planes). A piecemeal solution is often lacking the capability to
            consolidate the data from multiple sources. The composition of a
            complete solution, as partly proposed by <xref
                target="I-D.pedro-nmrg-anticipated-adaptation">ARCA</xref>,
            will be empowered and guided by a comprehensive framework.</t>

            <t>The passive measurement techniques can either consume too much
            network resources and render too much redundant data, or lead to
            inaccurate results. The active measurement techniques are
            indirect, and they can interfere with the user traffic. We need
            techniques that can collect direct and on-demand data from user
            traffic.</t>
          </list></t>
      </section>

      <section title="Glossary">

	      <t>Before further discussion, we list some key terminology and acronyms used in this document. We make an intended distinction 
		      between network telemetry and network OAM.</t>      

	<t><list style="hanging">
	  <t hangText="AI:"> Artificial Intelligence. Use machine-learning based technologies to automate network operation.</t>
	  <t hangText="BMP:"> BGP Monitoring Protocol</t>
	  <t hangText="DNP:"> Dynamic Network Probe </t>
	  <t hangText="DPI:"> Deep Packet Inspection </t>
	  <t hangText="gNMI:"> gRPC Network Management Interface </t>
	  <t hangText="gRPC:"> gRPC Remote Procedure Call </t>
          <t hangText="IDN:"> Intent-Driven Network</t>
	  <t hangText="IPFIX:"> IP Flow Information Export Protocol</t>
	  <t hangText="IPFPM:"> IP Flow Performance Measurement</t>
	  <t hangText="IOAM:"> In-situ OAM </t>
	  <t hangText="NETCONF:"> Network Configuration Protocol</t>
	  <t hangText="Network Telemetry:"> A general term for a new breed of network visibility techniques and protocols, with
		the characteristics defined in this document. Network telemetry enables smooth evolution toward intent-driven autonomous networks.</t>
	  <t hangText="NMS:"> Network Management System</t>
	  <t hangText="OAM:"> Operations, Administration, and Maintenance. A group of network
   		management functions that provide network fault indication, fault
   		localization, performance information, and data and diagnosis
   		functions. Most conventional network monitoring techniques and protocols belong to network OAM.</t>
	  <t hangText="SNMP:"> Simple Network Management Protocol </t>
	  <t hangText="YANG:"> A data modeling language for NETCONF </t>
	  <t hangText="YANG FSM:"> A YANG model to define device side finite state machine </t>
	  <t hangText="YANG PUSH:"> A method to subscribe pushed data from remote YANG datastore </t>
	</list></t>

      </section>

      <section title="Network Telemetry">

        <t>For a long time, network operators have relied upon protocols such as
	<xref target="RFC1157">SNMP</xref> to monitor the network. SNMP can only provide limited
        information about the network. Since SNMP is poll-based, it incurs low
        data rate and high processing overhead. Such drawbacks make SNMP
        unsuitable for today's automatic network applications.</t>

        <t>Network telemetry has emerged as a mainstream technical term to
        refer to the newer techniques of data collection and consumption,
        distinguishing itself from the conventional
        techniques for network OAM. It is expected that network
        telemetry can provide the necessary network visibility for autonomous
        networks, address the shortcomings of conventional
        OAM techniques, and allow for the emergence of new techniques bearing certain characteristics.</t>


        <t>One key difference between the network telemetry and the network OAM is that 
	  the network telemetry assumes an intelligent machine in the center of a closed control loop, 
	  while the network OAM assumes the human network operators in the middle of an open control loop.
	  The network telemetry can directly trigger the automated network operation; 
	  the conventional OAM tools only help human operators to monitor and diagnose the networks and guide manual network operations. 
	  The different assumptions lead to very different techniques. 
        </t>

        <t>Although the network telemetry techniques are just emerging and subject to continuous evolution,
        several defining characteristics of network telemetry have been well
        accepted:</t>

        <t><list style="symbols">
            <t>Push and Streaming: Instead of polling data from network devices, the telemetry
            collector subscribes to the streaming data pushed from the data
	    source in network devices.</t>

            <t>Volume and Velocity: The telemetry data is intended to be consumed by machines rather than by humans. Therefore,
		    the data volume is huge and the processing is often in real time.</t>

	    <t>Normalization and Unification: Telemetry aims to address the overall network automation needs. 
		    The piecemeal solutions offered by the conventional OAM approach are no longer suitable.
		    Efforts need to be made to normalize the data representation and unify the protocols.
	    </t>

            <t>Model-based: The data is model-based which allows applications to configure
		    and consume data with ease.
	    </t>

            <t>Data Fusion: The data for a single application can come from multiple data
            sources (e.g., cross domain, cross device, and cross layer) and
	    needs to be correlated to take effect.</t>

            <t>Dynamic and Interactive: Since the network telemetry is meant to be used in a closed control loop for network automation, 
	    it needs to run continuously and adapt to the dynamic and interactive queries from the network operation controller.
	    </t>

          </list></t>

        <t>In addition, the ideal network telemetry solution should also
        support the following features:</t>

        <t><list style="symbols">
            <t>In-Network Customization: The data can be customized in network at run-time to cater to the specific
            need of applications. This needs the support of a programmable
            data plane which allows probes to be deployed at flexible
            locations.</t>
            
            <t>Direct Data Plane Export: The data originated from data plane can be directly exported to the data consumer for efficiency,
	    especially when the data bandwidth is large and the real-time processing is required.
	    </t>

            <t>In-band Data Collection: In addition to the passive and active data collection approaches, the new hybrid approach allows
            data to be collected directly for any target flow on its entire forwarding path.  
	    </t>

	    <t>Non-intrusive: The telemetry system should not fall into the trap of the "observer effect". That is, it should not 
	      change the network behavior or affect the forwarding performance.</t>    

          </list></t>
      </section>
    </section>

    <section title="The Necessity of a Network Telemetry Framework">

        <t>Big data analytics and machine-learning based AI technologies are
        applied for network operation automation, relying on abundant data from networks. The
        single-sourced and static data acquisition cannot meet the data
        requirements. It is desirable to have a framework that integrates
        multiple telemetry approaches from different layers, and
        allows flexible combinations for different applications. The framework
        will benefit application development for the following
        reasons.</t>

	<t><list style="symbols">


            <t>The future autonomous networks will require a holistic view on network visibility.
            All the use cases and applications need to be supported uniformly and coherently under a single intelligent agent. 
	    Therefore, the protocols and mechanisms should be consolidated into 
	    a minimum yet comprehensive set. A telemetry framework can help to normalize  
            the technique developments.</t>	    


            <t>Network visibility presents multiple viewpoints. For example,
            the device viewpoint takes the network infrastructure as the
            monitoring object from which the network topology and device
            status can be acquired; the traffic viewpoint takes the flows or
            packets as the monitoring object from which the traffic quality
            and path can be acquired. An application may need to switch its
            viewpoint during operation. It may also need to correlate a
            service and its network experience to acquire the comprehensive
            information.</t>

            <t>Applications require network telemetry to be elastic in
            order to efficiently use the network resource and reduce the
            performance impact. Routine network monitoring covers the
            entire network with low data sampling rate. When issues arise or
            trends emerge, the telemetry data source can be modified and the
            data rate can be boosted.</t>

            <t>Efficient data fusion is critical for applications to reduce
            the overall quantity of data and improve the accuracy of
            analysis.</t>
          </list></t>

        <t>So far, some telemetry related work has been done within IETF.
        However, this work is fragmented and scattered in different working
        groups. The lack of coherence makes it difficult to assemble a
        comprehensive network telemetry system and causes repetitive and
        redundant work.</t>

        <t>A formal network telemetry framework is needed for constructing a
        working system. The framework should cover the concepts and components
        from the standardization perspective. This document clarifies the
        layers on which the telemetry is exerted and decomposes the telemetry
        system into a set of distinct components that the existing and future
        work can easily map to.</t>

<!--      
	<t>By articulating such a framework, we hope it can guide the future
        development where new technologies can fill the gap, the best
        technology can be chosen from the candidates in the same category, and
        the relevant components serving an application can be easily
	identified and assembled.</t>
-->
    </section>

    <section title="Network Telemetry Framework">
      <t>Telemetry can be applied on the data plane, the control plane,
      and the management plane in a network, as well as other sources out of the network, as shown in Figure 1.</t>

      <t><figure anchor="figure_1" title="Layer Category of the Network Telemetry Framework">
          <artwork><![CDATA[
                +------------------------------+
                |                              |
		|       Network Operation      |<-------+
		|          Applications        |        |
                |                              |        |
                +------------------------------+        |
                     ^      ^           ^               |
                     |      |           |               |
                     V      |           V               V
                +-----------|---+--------------+  +-----------+
                |           |   |              |  |           |
                | Control Pl|ane|              |  | External  |
                | Telemetry | <--->            |  | Data and  | 
                |           |   |              |  | Event     |
                |      ^    V   |  Management  |  | Telemetry |
                +------|--------+  Plane       |  |           |
                |      V        |  Telemetry   |  +-----------+
                |               |              |
                | Data Plane  <--->            |
                | Telemetry     |              |
                |               |              |
                +---------------+--------------+

]]></artwork>
        </figure></t>

      <t>Note that the interaction with the network operation applications can be indirect. For
      example, in the management plane telemetry, the management plane may
      need to acquire data from the data plane. On the other hand, an 
      application may involve more than one plane simultaneously. For example,
      an SLA compliance application may require both the data plane telemetry
      and the control plane telemetry.</t>

      <t>At each plane, the telemetry can be further partitioned into five
      distinct components:</t>

      <t><list style="hanging">
          <t hangText="Data Source:">Determine where the original data is
          acquired. The data source usually just provides raw data which needs
          further processing. A data source can be considered a probe. A probe
          can be statically installed or dynamically installed.</t>

          <t hangText="Data Subscription:">Determine the protocol and channel
          for applications to acquire desired data. Data subscription is also
          responsible for defining the desired data that might not be directly
          available from data sources. The subscription data can be described by
          a model. The model can be statically installed or dynamically
          installed.</t>

          <t hangText="Data Generation:">The original data needs to be
          processed, encoded, and formatted in network devices to meet
          application subscription requirements. This may involve in-network
          computing and processing on either the fast path or the slow path in
          network devices.</t>

          <t hangText="Data Export:">Determine how the ready data are
          delivered to applications.</t>

          <t hangText="Data Analysis and Storage:">In this final step, data is consumed by
          applications or stored for future reference. Data analysis can be interactive. It may initiate
          further data subscription.</t>
        </list></t>

      <t><figure anchor="figure_2" title="Components in the Network Telemetry Framework">
          <artwork><![CDATA[
                +------------------------------+
                |                              |
		|    Data Analysis/Storage     |
                |                              |         
                +------------------------------+         
                        |               ^                
                        |               |               
                        V               |                
                +---------------+--------------+ 
		|               |              | 
		| Data          | Data         |   
                | Subscription  | Export       |   
                |               |              |   
                +---------------+--------------+   
                |                              |
                |       Data Generation        |
                |                              |
                +------------------------------+
                |                              |
                |       Data Source            |
                |                              |
                +------------------------------+

]]></artwork>
        </figure></t>

      <t>Since most existing standard-related work belongs to the first four components,
      in the remainder of the document, we focus on these components only.</t>

      <section title="Existing Works Mapped in the Framework">
        <t>The following table provides a non-exhaustive list of existing
        works (mainly published in IETF and with the emphasis on the latest
        new technologies) and shows their positions in the framework.</t>

        <t><figure anchor="figure_3" title="Existing Work">
            <artwork><![CDATA[
         +-----------+--------------+---------------+--------------+
         |           | Management   | Control       | Data         |
         |           | Plane        | Plane         | Plane        |
         +-----------+--------------+---------------+--------------+
         |           | YANG Data    | Control Proto.| Flow/Packet  | 
         | Data      | Store        | Network State | Statistics   | 
         | Source    |              |               | States       |  
         |           |              |               | DPI          | 
         +-----------+--------------+---------------+--------------+
         |           | gRPC         | NETCONF/YANG  | NETCONF/YANG | 
         | Data      | YANG PUSH    | BGP           | YANG FSM     |
         | Subscribe |              |               |              |
         |           |              |               |              |
         +-----------+--------------+---------------+--------------+
         |           | Soft DNP     | Soft DNP      | In-situ OAM  | 
         | Data      |              |               | IPFPM        |   
         | Generation|              |               | Hard DNP     | 
         |           |              |               |              |
         +-----------+--------------+---------------+--------------+
         |           | gRPC         | BMP           | IPFIX        |
         | Data      | YANG PUSH    |               | UDP          |  
         | Export    | UDP          |               |              |
         |           |              |               |              |
         +-----------+--------------+---------------+--------------+

]]></artwork>
          </figure></t>
      </section>

      <section title="Management Plane Telemetry">
        <section title="Requirements and Challenges">
          <t>The management plane of the network element interacts with the
          Network Management System (NMS), and provides information such as
          performance data, network logging data, network warning and defects
          data, and network statistics and state data. Some legacy protocols
          are widely used for the management plane, such as SNMP and Syslog,
          but these protocols do not meet the requirements of the automatic
          network operation applications.</t>

          <t>New management plane telemetry protocols should consider the
          following requirements:</t>

          <t><list style="hanging">
              <t hangText="Convenient Data Subscription:">An application
              should have the freedom to choose the data export means such as
              the data types and the export frequency.</t>

              <t hangText="Structured Data:">For automatic network operation,
              machines will replace humans for network data comprehension. The
              schema languages such as YANG can efficiently describe
              structured data and normalize data encoding and
              transformation.</t>

              <t hangText="High Speed Data Transport:">In order to retain the
              information, a server needs to send a large amount of data at
              high frequency. Compact encoding formats are needed to compress
              the data and improve the data transport efficiency. The push
              mode, by replacing the poll mode, can also reduce the
              interactions between clients and servers, which helps to improve
              the server's efficiency.</t>
            </list></t>
        </section>

        <section title="Push Extensions for NETCONF">
          <t><xref target="RFC6241">NETCONF</xref> is one popular network
          management protocol, which is also recommended by the IETF. Although
          it can be used for data collection, its main strength is configuration.
          <xref target="I-D.ietf-netconf-yang-push">YANG Push</xref> extends
          NETCONF and enables subscriber applications to request a continuous,
          customized stream of updates from a YANG datastore. Providing such
          visibility into changes made upon YANG configuration and operational
          objects enables new capabilities based on the remote mirroring of
          configuration and operational state. Moreover, <xref
          target="I-D.zhou-netconf-multi-stream-originators">distributed data
          collection mechanism</xref> via <xref
          target="I-D.ietf-netconf-udp-pub-channel">UDP based publication
          channel</xref> provides enhanced efficiency for the NETCONF based
          telemetry.</t>
        </section>

        <section title="gRPC Network Management Interface">
          <t><xref target="I-D.openconfig-rtgwg-gnmi-spec">gRPC Network
          Management Interface (gNMI)</xref> is a network management protocol
          based on the <xref
          target="I-D.kumar-rtgwg-grpc-protocol">gRPC</xref> RPC (Remote
          Procedure Call) framework. With a single gRPC service definition,
          both configuration and telemetry can be covered. gRPC is an <xref
          target="RFC7540">HTTP/2</xref> based open source micro service
          communication framework. It provides a number of capabilities that
          make it well-suited for network telemetry, including:</t>

          <t><list style="symbols">
              <t>A full-duplex streaming transport model combined with a
              binary encoding mechanism further improves telemetry
              efficiency.</t>

              <t>gRPC provides higher-level feature consistency across
              platforms that common HTTP/2 libraries typically do not. This
              characteristic is especially valuable for the fact that
              telemetry data collectors normally reside on a large
              variety of platforms.</t>

              <t>The built-in load-balancing and failover mechanism.</t>
            </list></t>
        </section>
      </section>

      <section title="Control Plane Telemetry">
        <section title="Requirements and Challenges">
	   <t>The control plane telemetry refers to the health condition monitoring of different network protocols, 
		   which covers Layer 2 to Layer 7. Keeping track of the running status of these protocols is beneficial for detecting, localizing, 
		   and even predicting various network issues, as well as network optimization, in real-time and in fine granularity.
	   </t>
	   <t>One of the most challenging problems for the control plane telemetry is how to correlate the E2E Key Performance Indicators (KPI) 
		   to a specific layer's KPIs. For example, an IPTV user may describe his User Experience (UE) by the video fluency and definition. 
		   Then in case of an unusually poor UE KPI or a service disconnection, it is non-trivial work to delimit and localize the issue 
		   to the responsible protocol layer (e.g., the Transport Layer or the Network Layer), the responsible protocol 
		   (e.g., ISIS or BGP at the Network Layer), and finally the responsible device(s) with specific reasons.
	   </t>
	   <t> Traditional OAM-based approaches for control plane KPI measurement include PING (L3), Traceroute (L3), Y.1731 (L2) and so on. 
		   One common issue behind these methods is that they only measure the KPIs instead of reflecting the actual running status of these protocols, 
		   making them less effective or efficient for control plane troubleshooting and network optimization. 
		   An example of the control plane telemetry is the BGP monitoring protocol (BMP), which is currently used to monitor BGP routes 
		   and enables rich applications, such as BGP peer analysis, AS analysis, prefix analysis, security analysis, and so on. 
		   However, the monitoring of other layers, protocols and the cross-layer, cross-protocol KPI correlations are still in their infancy
		   (e.g., the IGP monitoring is missing), which require substantial further research.
  	   </t>
        </section>

        <section title="BGP Monitoring Protocol">
          <t><xref target="RFC7854">BGP Monitoring Protocol (BMP)</xref> is
          used to monitor BGP sessions and is intended to provide a convenient
	  interface for obtaining route views.</t>
          <t> 
	  The BGP routing information is collected from the monitored device(s) to the BMP monitoring station by setting up the BMP TCP session. 
	  The BGP peers are monitored by the BMP Peer Up and Peer Down Notifications. 
	  The BGP routes (including <xref target="RFC7854"> Adjacency_RIB_In </xref>, <xref target = "I-D.ietf-grow-bmp-adj-rib-out"> 
		  Adjacency_RIB_out</xref>, and <xref target="I-D.ietf-grow-bmp-local-rib">Local_Rib</xref>) are encapsulated in the BMP Route Monitoring Message 
	          and the BMP Route Mirroring Message, in the form of both initial table dump and real-time route update. 
		  In addition, BGP statistics are reported through the BMP Stats Report Message, which could be either timer triggered or event driven. 
		  More BMP extensions can be explored to enrich the applications of BGP monitoring. 
          </t>
        </section>
      </section>

      <section title="Data Plane Telemetry">
        <section title="Requirements and Challenges">
          <t>An effective data plane telemetry system relies on the data that
          the network device can expose. The data's quality, quantity, and
          timeliness must meet some stringent requirements. This raises some
          challenges to the network data plane devices where the first hand
          data originate.</t>

          <t><list style="symbols">
              <t>A data plane device's main function is user traffic
              processing and forwarding. While supporting network visibility
              is important, the telemetry is just an auxiliary function and it
              should not impede normal traffic processing and forwarding
              (i.e., the performance is not lowered and the behavior is not
              altered due to the telemetry functions).</t>

              <t>The network operation applications require end-to-end visibility
              from various sources, which results in a huge volume of data.
              However, the sheer data quantity should not stress the network
              bandwidth, regardless of the data delivery approach (i.e.,
              through in-band or out-of-band channels).</t>

              <t>The data plane devices must provide the data in a timely
              manner with the minimum possible delay. Long processing,
              transport, storage, and analysis delay can impact the
              effectiveness of the control loop and even render the data
              useless.</t>

              <t>The data should be structured and labeled, and easy for
              applications to parse and consume. At the same time, the data
              types needed by applications can vary significantly. The data
              plane devices need to provide enough flexibility and
              programmability to support the precise data provision for
              applications.</t>

              <t>The data plane telemetry should support incremental
              deployment and work even though some devices are unaware of the
              system. This challenge is highly relevant to the standards and
              legacy networks.</t>
	    </list></t>

	    <t>The industry has agreed that the data plane programmability is essential 
	     to support network telemetry. Newer data plane chips are all equipped 
	     with advanced telemetry features and provide flexibility to support 
	     customized telemetry functions.
	    </t>
        </section>

        <section title = "Technique Taxonomy">
		
	    <t>There can be multiple possible dimensions to classify the data plane telemetry techniques.</t>

            <t><list style="hanging">
                <t hangText="Active and Passive:">
		     The active and passive methods (as well as the hybrid types) are well documented in <xref target="RFC7799"></xref>.
		     The passive methods include TCPDUMP, <xref target="RFC7011">IPFIX</xref>, sFlow, and traffic mirroring. These methods usually have low data coverage, 
		     and improving the data coverage comes at a very high bandwidth cost. On the other hand, the active methods 
		     include Ping, Traceroute, <xref target="RFC4656">OWAMP</xref>, and <xref target="RFC5357">TWAMP</xref>. 
		     These methods are intrusive and only provide indirect network measurement results. 
		     The hybrid methods, including <xref target="I-D.brockners-inband-oam-requirements">in-situ OAM</xref>, <xref target="RFC8321">IPFPM</xref>, 
		     and <xref target="I-D.fioccola-ippm-multipoint-alt-mark">Multipoint Alternate Marking</xref>, 
		     provide a well-balanced and more flexible approach. However, these
		     methods are also more complex to implement.   
	        </t>
		<t hangText="In-Band and Out-of-Band:">
		     The telemetry data, before being exported to some collector, can be carried in user packets. 
		     Such methods are considered in-band (e.g., <xref target="I-D.brockners-inband-oam-requirements">in-situ OAM</xref>). 
		     If the telemetry data is directly exported to some collector without modifying the user packets,
		     such methods are considered out-of-band (e.g., postcard-based INT). 
		     It is possible to have hybrid methods. 
		     For example, only the telemetry instruction or partial data is carried by user packets (e.g., <xref target="RFC8321">IPFPM</xref>). 
                </t>

		<t hangText="E2E and In-Network:">
		     Some E2E methods start from and end at the network end hosts (e.g., Ping). The other methods work in networks and are transparent to 
                     end hosts. However, if needed, the in-network methods can be easily extended into end hosts.
		</t>

		<t hangText="Flow, Path, and Node:">
			Depending on the telemetry objective, the methods can be flow-based (e.g., <xref target="I-D.brockners-inband-oam-requirements">in-situ OAM</xref>), 
			path-based (e.g., Traceroute), 
		     and node-based (e.g., <xref target="RFC7011">IPFIX</xref>).	
		</t>	

            </list></t>		

	</section>	

	
        <section title="The IPFPM technology">
	  <t>The Alternate Marking method is an efficient way to perform packet loss, delay, and jitter measurements 
	  in both IP and overlay networks, as presented in 
	  <xref target="RFC8321">IPFPM</xref> and <xref target="I-D.fioccola-ippm-multipoint-alt-mark"/>.</t> 
	  
	  <t>This technique can be applied to point-to-point and multipoint-to-multipoint flows.
	  Alternate Marking creates batches of packets by alternating the value of 1 bit (or a label) of the packet header. 
	  These batches of packets are unambiguously recognized over the network and the comparison of packet counters 
	  for each batch allows the packet loss calculation. The same idea can be applied to delay measurement 
	  by selecting ad hoc packets with a marking bit dedicated for delay measurements.</t>
	  
	  <t>Alternate Marking method needs two counters each marking period for each flow under monitor.
	  For instance, by considering n measurement points and m monitored flows, the order of magnitude of the packet 
	  counters for each time interval is n*m*2 (1 per color).</t>
	  
	  <t>Since networks offer rich sets of network performance measurement data (e.g., packet counters), 
	  traditional approaches run into limitations. One reason is the fact that the bottleneck is 
	  the generation and export of the data and the amount of data that can be reasonably collected 
	  from the network. In addition, management tasks related to determining and configuring which data 
	  to generate lead to significant deployment challenges.</t>
	  
	  <t>The Multipoint Alternate Marking approach, described in <xref target="I-D.fioccola-ippm-multipoint-alt-mark"/>, 
	  aims to resolve this issue and make the performance monitoring more flexible in case a detailed analysis is not needed.</t>
	  
	  <t>An application orchestrates network performance measurement tasks across the network 
	  to allow optimized monitoring, and it can calibrate how deeply monitoring data is obtained from the network 
	  by configuring measurement points roughly or meticulously.</t>
	  
	  <t>Using Alternate Marking, it is possible to monitor a multipoint network without examining it in depth by using 
	  network clustering (clusters are subnetworks, i.e., portions of the entire network that preserve the same property as 
	  the entire network). In case there is packet loss or the delay is too high, the filtering criteria 
	  can be made more specific in order to perform a detailed analysis by using a different combination of clusters, up to a 
	  per-flow measurement as described in <xref target="RFC8321">IPFPM</xref>.</t>
	  
	  <t>In summary, an application can initially configure end-to-end monitoring between ingress 
	  points and egress points of the network. If the network does not experience issues, this approximate 
	  monitoring is good enough and is very cheap in terms of network resources. But, in case of problems, 
	  the application becomes aware of the issues from this approximate monitoring and, in order to localize 
	  the portion of the network that has issues, configures the measurement points more exhaustively. So a new 
	  detailed monitoring is performed. After the detection and resolution of the problem the initial approximate 
	  monitoring can be used again.</t>
	  
  <!--	  <t>This idea is general and can be applied to different performance measurements techniques, but in particular to
	  Alternate Marking.</t>
  -->
	</section>
		
        <section title="Dynamic Network Probe">
          <t>Hardware based <xref target="I-D.song-opsawg-dnp4iq">Dynamic
          Network Probe (DNP)</xref> provides a programmable means to
          customize the data that an application collects from the data plane.
          A direct benefit of DNP is the reduction of the exported data. A
          full DNP solution covers several components including data source,
          data subscription, and data generation. The data subscription needs
          to define the custom data which can be composed and derived from the
          raw data sources. The data generation takes advantage of the
          moderate in-network computing to produce the desired data.</t>

          <t>While DNP can introduce unforeseeable flexibility to the data
          plane telemetry, it also faces some challenges. It requires a
          flexible data plane that can be dynamically reprogrammed at run-time.
          The programming API is yet to be defined.</t>
        </section>

        <section title="IP Flow Information Export (IPFIX) protocol">
          <t>Traffic on a network can be seen as a set of flows passing
          through network elements. <xref target="RFC7011">IP Flow Information
          Export (IPFIX) </xref> provides a means of transmitting traffic flow
          information for administrative or other purposes. A typical
          IPFIX-enabled system includes a pool of Metering Processes that
          collect data packets at one or more Observation Points, optionally
          filter them, and aggregate information about these packets. An Exporter then
          gathers each of the Observation Points together into an Observation
          Domain and sends this information via the IPFIX protocol to a
          Collector.</t>
        </section>

        <section title="In-Situ OAM">
          <t>Traditional passive and active monitoring and measurement
          techniques are either inaccurate or resource-consuming. It is
          preferable to directly acquire data associated with a flow's packets
          when the packets pass through a network. <xref
          target="I-D.brockners-inband-oam-requirements">In-situ OAM
          (iOAM)</xref>, a data generation technique, embeds a new instruction
          header to user packets and the instruction directs the network nodes
          to add the requested data to the packets. Thus, at the path end the
          packet's experience on the entire forwarding path can be collected.
          Such firsthand data is invaluable to many network OAM
          applications.</t>

          <t>However, iOAM also faces some challenges. The issues on
          performance impact, security, scalability and overhead limits,
          encapsulation difficulties in some protocols, and cross-domain
          deployment need to be addressed.</t>
	</section>

      </section>

      <section title="External Data and Event Telemetry">
	      <t>Events that occur outside the boundaries of the network system are another important source of telemetry information. 
		 Correlating both internal telemetry data and external events with the requirements of network systems, 
		 as presented in <xref target="I-D.pedro-nmrg-anticipated-adaptation">Exploiting External Event Detectors to 
	         Anticipate Resource Requirements for the Elastic Adaptation of SDN/NFV Systems</xref>, 
	         provides a strategic and functional advantage to management operations.</t>
	 
	  <section title="Requirements and Challenges">
		  <t>As with other sources of telemetry information, the data and events must meet strict requirements, 
	             especially in terms of timeliness, which is essential to properly incorporate external event information to management cycles. 
		     Thus, the specific challenges  are described as follows:</t>
               
	       <t><list style="symbols">
                  <t>The role of external event detector can be played by multiple elements, including hardware 
			  (e.g. physical sensors, such as seismometers) and software (e.g. Big Data sources that analyze 
			  streams of information, such as Twitter messages). Thus, the transmitted data must support different 
			  shapes but, at the same time, follow a common but extensible ontology.
		  </t>
		  <t>Since the main function of the external event detectors is actually to perform the notifications, 
			  their timeliness is assumed. However, once messages have been dispatched, they must be quickly 
			  collected and inserted into the control plane with variable priority, which will be high for important 
			  sources and/or important events and low for secondary ones.
		  </t>
                  <t>The ontology used by external detectors must be easily adopted by current and future devices and applications. 
			  Therefore, it must be easily mapped to current information models, such as in terms of YANG.
		  </t>
	      </list></t>
	      <t>Organizing together both internal and external telemetry information will be key for the general exploitation of the 
		      management possibilities of current and future network systems, as reflected in the incorporation of cognitive 
		      capabilities to new hardware and software (virtual) elements.
	      </t>
          </section>
        </section>

    </section>

    <section anchor="level" title="Evolution of Network Telemetry">

	    <t>As the network is evolving towards the automated operation, network telemetry also undergoes several levels of evolution.</t> 

	    <t><list style="hanging">
			    <t hangText="Level 0 - Static Telemetry:">
                                The telemetry data is determined at design time. The network operator can only configure how to use it with limited flexibility.
			    </t>
			    <t hangText="Level 1 - Dynamic Telemetry:">
				The telemetry data can be dynamically programmed or configured at runtime, allowing a tradeoff among resource, performance, flexibility, and coverage. 
				    DNP is an effort towards this direction.  
			    </t>
			    <t hangText="Level 2 - Interactive Telemetry:">
				    The network operator can continuously customize the telemetry data in real time to reflect the network operation's visibility requirements. 
				    At this level, some tasks can be automated but human operators still need to sit in the middle to make decisions.      
			    </t>
			    <t hangText="Level 3 - Closed-loop Telemetry:">
				Human operators are completely excluded from the control loop. The intelligent network operation engine automatically issues the telemetry data request, 
				analyzes the data, and updates the network operations in closed control loops.      
			    </t>		    
	    </list></t>

	    <t>While most of the existing technologies belong to level 0 and level 1, with the help of a clearly defined network telemetry framework, 
		    we can assemble the technologies to support level 2 and make solid steps towards level 3. </t>  
 
    </section>

    <section anchor="Security" title="Security Considerations">
      <t>TBD</t>
    </section>

    <section anchor="IANA" title="IANA Considerations">
      <t>This document includes no request to IANA.</t>
    </section>
    
    <section anchor="Contributors" title="Contributors">
      <t>
        The other main contributors of this document are listed as follows.
      </t><t>
      <list style="symbols">
	      <t>
		      James N. Guichard, Huawei
	      </t>	      
	      <t>
		      Yunan Gu, Huawei
	      </t>	      
      </list>	  	
      </t>
    </section>

    <section anchor="Acknowledgments" title="Acknowledgments">
      <t>We would like to thank Victor Liu and others who have provided helpful comments and suggestions to improve this document.</t>
    </section>
  </middle>

  <back>
    <references title="Normative References">
      <?rfc include='reference.RFC.2119'?>
      <?rfc include='reference.RFC.8174'?>
    </references>

    <references title="Informative References">
      <?rfc include="reference.RFC.6241"?>
      <?rfc include='reference.RFC.7540'?>
      <?rfc include='reference.RFC.7854'?>
      <?rfc include='reference.RFC.8321'?>
      <?rfc include='reference.RFC.7011'?>
      <?rfc include='reference.RFC.4656'?>
      <?rfc include='reference.RFC.5357'?>
      <?rfc include='reference.RFC.1157'?>
      <?rfc include='reference.RFC.7276'?>
      <?rfc include='reference.RFC.7799'?>
      <?rfc include='reference.I-D.ietf-grow-bmp-adj-rib-out'?>
      <?rfc include='reference.I-D.ietf-grow-bmp-local-rib'?>
      <?rfc include='reference.I-D.ietf-netconf-yang-push'?>
      <?rfc include='reference.I-D.zhou-netconf-multi-stream-originators'?>
      <?rfc include='reference.I-D.ietf-netconf-udp-pub-channel'?>
      <?rfc include='reference.I-D.openconfig-rtgwg-gnmi-spec'?>
      <?rfc include='reference.I-D.kumar-rtgwg-grpc-protocol'?>
      <?rfc include='reference.I-D.song-opsawg-dnp4iq'?>
      <?rfc include='reference.I-D.brockners-inband-oam-requirements'?>
      <?rfc include='reference.I-D.fioccola-ippm-multipoint-alt-mark'?>
      <?rfc include='reference.I-D.pedro-nmrg-anticipated-adaptation'?>
    </references>
  </back>
</rfc>
