Index: e1000_defines.h =================================================================== RCS file: /home/ncvs/src/sys/dev/em/e1000_defines.h,v retrieving revision 1.3 diff -u -p -r1.3 e1000_defines.h --- e1000_defines.h 16 May 2007 00:14:23 -0000 1.3 +++ e1000_defines.h 3 Oct 2007 21:36:07 -0000 @@ -746,7 +746,6 @@ */ #define IMS_ENABLE_MASK ( \ E1000_IMS_RXT0 | \ - E1000_IMS_TXDW | \ E1000_IMS_RXDMT0 | \ E1000_IMS_RXSEQ | \ E1000_IMS_LSC) Index: if_em.c =================================================================== RCS file: /home/ncvs/src/sys/dev/em/if_em.c,v retrieving revision 1.184 diff -u -p -r1.184 if_em.c --- if_em.c 10 Sep 2007 21:50:40 -0000 1.184 +++ if_em.c 3 Oct 2007 21:41:12 -0000 @@ -240,14 +240,16 @@ static void em_initialize_transmit_unit( static int em_setup_receive_structures(struct adapter *); static void em_initialize_receive_unit(struct adapter *); static void em_enable_intr(struct adapter *); +static void em_enable_intr_rx(struct adapter *); static void em_disable_intr(struct adapter *); +static void em_disable_intr_rx(struct adapter *); static void em_free_transmit_structures(struct adapter *); static void em_free_receive_structures(struct adapter *); static void em_update_stats_counters(struct adapter *); static void em_txeof(struct adapter *); static int em_allocate_receive_structures(struct adapter *); static int em_allocate_transmit_structures(struct adapter *); -static int em_rxeof(struct adapter *, int); +static int em_rxeof(struct adapter *, int, int); #ifndef __NO_STRICT_ALIGNMENT static int em_fixup_rx(struct adapter *); #endif @@ -292,14 +294,19 @@ static void em_get_hw_control(struct static void em_release_hw_control(struct adapter *); static void em_enable_wakeup(device_t); + +/* + * Fast interrupt handler and legacy ithread/polling modes are + * mutually exclusive. 
+ */ #ifdef DEVICE_POLLING static poll_handler_t em_poll; static void em_intr(void *); #else +static void em_add_int_rx_kthread_priority(struct adapter *, const char *, + const char *, int *, int); static int em_intr_fast(void *); -static void em_add_rx_process_limit(struct adapter *, const char *, - const char *, int *, int); -static void em_handle_rxtx(void *context, int pending); +static void em_kthread_rx(void *arg); static void em_handle_link(void *context, int pending); #endif @@ -351,9 +358,8 @@ TUNABLE_INT("hw.em.rxd", &em_rxd); TUNABLE_INT("hw.em.txd", &em_txd); TUNABLE_INT("hw.em.smart_pwr_down", &em_smart_pwr_down); #ifndef DEVICE_POLLING -/* How many packets rxeof tries to clean at a time */ -static int em_rx_process_limit = 100; -TUNABLE_INT("hw.em.rx_process_limit", &em_rx_process_limit); +static int em_rx_kthread_priority = PRI_MAX_KERN; +TUNABLE_INT("hw.em.rx_kthread_priority", &em_rx_kthread_priority); #endif /* Global used in WOL setup with multiport cards */ static int global_quad_port_a = 0; @@ -370,7 +376,7 @@ static int global_quad_port_a = 0; static int em_probe(device_t dev) { - char adapter_name[60]; + char adapter_name[1024]; /* XXX why? 
*/ uint16_t pci_vendor_id = 0; uint16_t pci_device_id = 0; uint16_t pci_subvendor_id = 0; @@ -431,7 +437,8 @@ em_attach(device_t dev) adapter = device_get_softc(dev); adapter->dev = adapter->osdep.dev = dev; - EM_LOCK_INIT(adapter, device_get_nameunit(dev)); + EM_RXLOCK_INIT(adapter, device_get_nameunit(dev)); + EM_TXLOCK_INIT(adapter, device_get_nameunit(dev)); /* SYSCTL stuff */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), @@ -444,8 +451,8 @@ em_attach(device_t dev) OID_AUTO, "stats", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, em_sysctl_stats, "I", "Statistics"); - callout_init_mtx(&adapter->timer, &adapter->mtx, 0); - callout_init_mtx(&adapter->tx_fifo_timer, &adapter->mtx, 0); + callout_init_mtx(&adapter->timer, &adapter->txmtx, 0); + callout_init_mtx(&adapter->tx_fifo_timer, &adapter->txmtx, 0); /* Determine hardware and mac info */ em_identify_hardware(adapter); @@ -506,10 +513,10 @@ em_attach(device_t dev) } #ifndef DEVICE_POLLING - /* Sysctls for limiting the amount of work done in the taskqueue */ - em_add_rx_process_limit(adapter, "rx_processing_limit", - "max number of rx packets to process", &adapter->rx_process_limit, - em_rx_process_limit); + /* Sysctl for setting the RX kthreads' priority */ + em_add_int_rx_kthread_priority(adapter, "rx_kthread_priority", + "priority of RX handler kthread", &adapter->rx_kthread_priority, + em_rx_kthread_priority); #endif /* @@ -517,25 +524,14 @@ em_attach(device_t dev) * must not exceed hardware maximum, and must be multiple * of E1000_DBA_ALIGN. 
*/ - if (((em_txd * sizeof(struct e1000_tx_desc)) % EM_DBA_ALIGN) != 0 || - (adapter->hw.mac.type >= e1000_82544 && em_txd > EM_MAX_TXD) || - (adapter->hw.mac.type < e1000_82544 && em_txd > EM_MAX_TXD_82543) || - (em_txd < EM_MIN_TXD)) { - device_printf(dev, "Using %d TX descriptors instead of %d!\n", - EM_DEFAULT_TXD, em_txd); - adapter->num_tx_desc = EM_DEFAULT_TXD; - } else - adapter->num_tx_desc = em_txd; - if (((em_rxd * sizeof(struct e1000_rx_desc)) % EM_DBA_ALIGN) != 0 || - (adapter->hw.mac.type >= e1000_82544 && em_rxd > EM_MAX_RXD) || - (adapter->hw.mac.type < e1000_82544 && em_rxd > EM_MAX_RXD_82543) || - (em_rxd < EM_MIN_RXD)) { - device_printf(dev, "Using %d RX descriptors instead of %d!\n", - EM_DEFAULT_RXD, em_rxd); - adapter->num_rx_desc = EM_DEFAULT_RXD; - } else - adapter->num_rx_desc = em_rxd; - + if (adapter->hw.mac.type >= e1000_82544) { + adapter->num_tx_desc = EM_MAX_TXD; + adapter->num_rx_desc = EM_MAX_RXD; + } else { + adapter->num_tx_desc = EM_MAX_TXD_82543; + adapter->num_rx_desc = EM_MAX_RXD_82543; + } + adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.wait_for_link = FALSE; adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT; @@ -736,7 +732,9 @@ err_tx_desc: err_pci: em_free_intr(adapter); em_free_pci_resources(adapter); - EM_LOCK_DESTROY(adapter); + /* XXX */ + EM_TXLOCK_DESTROY(adapter); + EM_RXLOCK_DESTROY(adapter); return (error); } @@ -766,7 +764,8 @@ em_detach(device_t dev) em_disable_intr(adapter); em_free_intr(adapter); - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); adapter->in_detach = 1; em_stop(adapter); e1000_phy_hw_reset(&adapter->hw); @@ -785,7 +784,8 @@ em_detach(device_t dev) em_enable_wakeup(dev); } - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); ether_ifdetach(adapter->ifp); callout_drain(&adapter->timer); @@ -811,7 +811,8 @@ em_detach(device_t dev) adapter->rx_desc_base = NULL; } - EM_LOCK_DESTROY(adapter); + EM_TXLOCK_DESTROY(adapter); + EM_RXLOCK_DESTROY(adapter); 
return (0); } @@ -836,7 +837,8 @@ em_suspend(device_t dev) { struct adapter *adapter = device_get_softc(dev); - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); em_stop(adapter); em_release_manageability(adapter); @@ -853,7 +855,8 @@ em_suspend(device_t dev) em_enable_wakeup(dev); } - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); return bus_generic_suspend(dev); } @@ -864,7 +867,8 @@ em_resume(device_t dev) struct adapter *adapter = device_get_softc(dev); struct ifnet *ifp = adapter->ifp; - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); em_init_locked(adapter); em_init_manageability(adapter); @@ -872,7 +876,8 @@ em_resume(device_t dev) (ifp->if_drv_flags & IFF_DRV_RUNNING)) em_start_locked(ifp); - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); return bus_generic_resume(dev); } @@ -894,7 +899,7 @@ em_start_locked(struct ifnet *ifp) struct adapter *adapter = ifp->if_softc; struct mbuf *m_head; - EM_LOCK_ASSERT(adapter); + EM_TXLOCK_ASSERT(adapter); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) @@ -906,7 +911,7 @@ em_start_locked(struct ifnet *ifp) IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) - break; + continue; /* * Encapsulation can modify our pointer, and or make it * NULL on failure. In that event, we can't requeue. @@ -926,7 +931,12 @@ em_start_locked(struct ifnet *ifp) ETHER_BPF_MTAP(ifp, m_head); /* Set timeout in case hardware has problems transmitting. 
*/ - adapter->watchdog_timer = EM_TX_TIMEOUT; + adapter->tx_counter ++; + } + + if (adapter->num_tx_desc - adapter->num_tx_desc_avail > 32) { + /* it's time to clean a little bit */ + em_txeof (adapter); } } @@ -935,10 +945,10 @@ em_start(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; - EM_LOCK(adapter); + EM_TXLOCK(adapter); if (ifp->if_drv_flags & IFF_DRV_RUNNING) em_start_locked(ifp); - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); } /********************************************************************* @@ -973,9 +983,11 @@ em_ioctl(struct ifnet *ifp, u_long comma */ ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); em_init_locked(adapter); - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); } arp_ifinit(ifp, ifa); } else @@ -988,7 +1000,8 @@ em_ioctl(struct ifnet *ifp, u_long comma IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFMTU (Set Interface MTU)"); - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); switch (adapter->hw.mac.type) { case e1000_82573: /* @@ -1019,7 +1032,8 @@ em_ioctl(struct ifnet *ifp, u_long comma } if (ifr->ifr_mtu > max_frame_size - ETHER_HDR_LEN - ETHER_CRC_LEN) { - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); error = EINVAL; break; } @@ -1028,13 +1042,15 @@ em_ioctl(struct ifnet *ifp, u_long comma adapter->hw.mac.max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; em_init_locked(adapter); - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); break; } case SIOCSIFFLAGS: IOCTL_DEBUGOUT("ioctl rcv'd:\ SIOCSIFFLAGS (Set Interface Flags)"); - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ adapter->if_flags) & @@ -1048,13 +1064,15 @@ em_ioctl(struct ifnet *ifp, u_long comma if (ifp->if_drv_flags & IFF_DRV_RUNNING) em_stop(adapter); adapter->if_flags = ifp->if_flags; - 
EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); break; case SIOCADDMULTI: case SIOCDELMULTI: IOCTL_DEBUGOUT("ioctl rcv'd: SIOC(ADD|DEL)MULTI"); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); em_disable_intr(adapter); em_set_multi(adapter); if (adapter->hw.mac.type == e1000_82542 && @@ -1065,19 +1083,23 @@ em_ioctl(struct ifnet *ifp, u_long comma if (!(ifp->if_capenable & IFCAP_POLLING)) #endif em_enable_intr(adapter); - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); } break; case SIOCSIFMEDIA: /* Check SOL/IDER usage */ - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); if (e1000_check_reset_block(&adapter->hw)) { - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); device_printf(adapter->dev, "Media change is" " blocked due to SOL/IDER session.\n"); break; } - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); case SIOCGIFMEDIA: IOCTL_DEBUGOUT("ioctl rcv'd: \ SIOCxIFMEDIA (Get/Set Interface Media)"); @@ -1096,17 +1118,21 @@ em_ioctl(struct ifnet *ifp, u_long comma error = ether_poll_register(em_poll, ifp); if (error) return (error); - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); em_disable_intr(adapter); ifp->if_capenable |= IFCAP_POLLING; - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); } else { error = ether_poll_deregister(ifp); /* Enable interrupt even in error case */ - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); em_enable_intr(adapter); ifp->if_capenable &= ~IFCAP_POLLING; - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); } } #endif @@ -1149,29 +1175,49 @@ static void em_watchdog(struct adapter *adapter) { - EM_LOCK_ASSERT(adapter); + EM_TXLOCK_ASSERT(adapter); - /* - ** The timer is set to 5 every time start queues a packet. - ** Then txeof keeps resetting to 5 as long as it cleans at - ** least one descriptor. 
- ** Finally, anytime all descriptors are clean the timer is - ** set to 0. - */ - if (adapter->watchdog_timer == 0 || --adapter->watchdog_timer) - return; + if (E1000_READ_REG(&adapter->hw, E1000_TDH) == + E1000_READ_REG(&adapter->hw, E1000_TDT)) { + /* TX queue is clean. Nothing to wait for */ + adapter->tx_counter_watchdog_mark = 0; + } /* If we are in this routine because of pause frames, then * don't reset the hardware. */ if (E1000_READ_REG(&adapter->hw, E1000_STATUS) & E1000_STATUS_TXOFF) { - adapter->watchdog_timer = EM_TX_TIMEOUT; + /* XOFF received */ + adapter->tx_counter_watchdog_mark = 0; + return; + } + + if (!adapter->tx_counter_watchdog_mark) { + /* watchdog isn't started yet, let's do it */ + adapter->tx_counter_watchdog_mark = adapter->tx_counter; + adapter->tx_tdh_watchdog_mark = E1000_READ_REG(&adapter->hw, E1000_TDH); + return; + } + + if (adapter->tx_counter - adapter->tx_counter_watchdog_mark >= adapter->num_tx_desc) { + /* TX ring has wrapped, clear the watchdog condition */ + adapter->tx_counter_watchdog_mark = 0; return; } - if (e1000_check_for_link(&adapter->hw) == 0) + if (adapter->tx_tdh_watchdog_mark != E1000_READ_REG(&adapter->hw, E1000_TDH)) { + /* Something was sent */ + adapter->tx_counter_watchdog_mark = 0; + return; + } + + if (e1000_check_for_link(&adapter->hw) == 0) { device_printf(adapter->dev, "watchdog timeout -- resetting\n"); + em_print_hw_stats(adapter); + em_print_debug_info(adapter); + } + adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; @@ -1198,7 +1244,8 @@ em_init_locked(struct adapter *adapter) INIT_DEBUGOUT("em_init: begin"); - EM_LOCK_ASSERT(adapter); + EM_RXLOCK_ASSERT(adapter); + EM_TXLOCK_ASSERT(adapter); em_stop(adapter); @@ -1337,9 +1384,11 @@ em_init(void *arg) { struct adapter *adapter = arg; - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); em_init_locked(adapter); - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); } @@ -1355,9 +1404,11 @@ 
em_poll(struct ifnet *ifp, enum poll_cmd struct adapter *adapter = ifp->if_softc; uint32_t reg_icr; - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); return; } @@ -1372,12 +1423,13 @@ em_poll(struct ifnet *ifp, enum poll_cmd em_local_timer, adapter); } } - em_rxeof(adapter, count); + em_rxeof(adapter, count, 0); em_txeof(adapter); if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) em_start_locked(ifp); - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); } /********************************************************************* @@ -1393,11 +1445,11 @@ em_intr(void *arg) struct ifnet *ifp; uint32_t reg_icr; - EM_LOCK(adapter); + /* XXX EM_LOCK(adapter); */ ifp = adapter->ifp; if (ifp->if_capenable & IFCAP_POLLING) { - EM_UNLOCK(adapter); + /* EM_UNLOCK(adapter); */ return; } @@ -1419,29 +1471,35 @@ em_intr(void *arg) if (reg_icr == 0xffffffff) break; - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - em_rxeof(adapter, -1); - em_txeof(adapter); - } - /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); callout_stop(&adapter->timer); adapter->hw.mac.get_link_status = 1; e1000_check_for_link(&adapter->hw); em_update_link_status(adapter); callout_reset(&adapter->timer, hz, em_local_timer, adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); + } + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (reg_icr & (E1000_ICR_RXDMT0|E1000_ICR_RXO|E1000_ICR_RXT0)) { + EM_RXLOCK(adapter); + em_rxeof(adapter, -1,0); + EM_RXUNLOCK(adapter); + } + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { + EM_TXLOCK(adapter); + em_start_locked(ifp); + EM_TXUNLOCK(adapter); + } } if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; } - - if (ifp->if_drv_flags & IFF_DRV_RUNNING && - !IFQ_DRV_IS_EMPTY(&ifp->if_snd)) - em_start_locked(ifp); - EM_UNLOCK(adapter); } #else /* if not DEVICE_POLLING, then fast 
interrupt routines only */ @@ -1454,9 +1512,11 @@ em_handle_link(void *context, int pendin ifp = adapter->ifp; - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); return; } @@ -1465,33 +1525,37 @@ em_handle_link(void *context, int pendin e1000_check_for_link(&adapter->hw); em_update_link_status(adapter); callout_reset(&adapter->timer, hz, em_local_timer, adapter); - EM_UNLOCK(adapter); + + wakeup (&adapter->rxmtx); + wakeup (&adapter->txmtx); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); } static void -em_handle_rxtx(void *context, int pending) +em_kthread_rx(void *arg) { - struct adapter *adapter = context; - struct ifnet *ifp; + struct adapter *adapter = arg; + struct ifnet *ifp = adapter->ifp; + int myKthreadNo = 0; - ifp = adapter->ifp; + EM_RXLOCK(adapter); + myKthreadNo = adapter -> rxKthreadNo ++; + adapter -> rxIpBeingProcessed[myKthreadNo] = 0; + adapter -> waitedBy[myKthreadNo] = 0; + EM_RXUNLOCK(adapter); - /* - * TODO: - * It should be possible to run the tx clean loop without the lock. 
- */ - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - if (em_rxeof(adapter, adapter->rx_process_limit) != 0) - taskqueue_enqueue(adapter->tq, &adapter->rxtx_task); - EM_LOCK(adapter); - em_txeof(adapter); - - if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) - em_start_locked(ifp); - EM_UNLOCK(adapter); + while (!adapter->rx_shutdown_flag) { + tsleep(&adapter->rxmtx, adapter->rx_kthread_priority, "em_rx", hz); + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + EM_RXLOCK(adapter); + em_rxeof(adapter,-1, myKthreadNo); + EM_RXUNLOCK(adapter); + } + em_enable_intr_rx(adapter); } - em_enable_intr(adapter); + kthread_exit(0); } /********************************************************************* @@ -1526,13 +1590,17 @@ em_intr_fast(void *arg) (reg_icr & E1000_ICR_INT_ASSERTED) == 0) return (FILTER_STRAY); - /* - * Mask interrupts until the taskqueue is finished running. This is - * cheap, just assume that it is needed. This also works around the - * MSI message reordering errata on certain systems. - */ - em_disable_intr(adapter); - taskqueue_enqueue(adapter->tq, &adapter->rxtx_task); + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (reg_icr & (E1000_ICR_RXDMT0|E1000_ICR_RXO|E1000_ICR_RXT0)) { + /* + * Mask RX interrupts until the RX kthread is finished running. This is + * cheap, just assume that it is needed. This also works around the + * MSI message reordering errata on certain systems. 
+ */ + em_disable_intr_rx (adapter); + wakeup (&adapter->rxmtx); + } + } /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) @@ -1560,7 +1628,8 @@ em_media_status(struct ifnet *ifp, struc INIT_DEBUGOUT("em_media_status: begin"); - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); e1000_check_for_link(&adapter->hw); em_update_link_status(adapter); @@ -1568,7 +1637,8 @@ em_media_status(struct ifnet *ifp, struc ifmr->ifm_active = IFM_ETHER; if (!adapter->link_active) { - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); return; } @@ -1596,7 +1666,8 @@ em_media_status(struct ifnet *ifp, struc else ifmr->ifm_active |= IFM_HDX; } - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); } /********************************************************************* @@ -1618,7 +1689,8 @@ em_media_change(struct ifnet *ifp) if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: adapter->hw.mac.autoneg = DO_AUTO_NEG; @@ -1656,7 +1728,8 @@ em_media_change(struct ifnet *ifp) adapter->hw.phy.reset_disable = FALSE; em_init_locked(adapter); - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); return (0); } @@ -2130,7 +2203,8 @@ em_82547_move_tail(void *arg) uint16_t length = 0; boolean_t eop = 0; - EM_LOCK_ASSERT(adapter); + EM_RXLOCK_ASSERT(adapter); + EM_TXLOCK_ASSERT(adapter); hw_tdt = E1000_READ_REG(&adapter->hw, E1000_TDT); sw_tdt = adapter->next_avail_tx_desc; @@ -2337,7 +2411,8 @@ em_local_timer(void *arg) struct adapter *adapter = arg; struct ifnet *ifp = adapter->ifp; - EM_LOCK_ASSERT(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); e1000_check_for_link(&adapter->hw); em_update_link_status(adapter); @@ -2359,6 +2434,9 @@ em_local_timer(void *arg) em_watchdog(adapter); callout_reset(&adapter->timer, hz, em_local_timer, adapter); + + EM_TXUNLOCK(adapter); + 
EM_RXUNLOCK(adapter); } static void @@ -2419,7 +2497,8 @@ em_stop(void *arg) struct adapter *adapter = arg; struct ifnet *ifp = adapter->ifp; - EM_LOCK_ASSERT(adapter); + EM_RXLOCK_ASSERT(adapter); + EM_TXLOCK_ASSERT(adapter); INIT_DEBUGOUT("em_stop: begin"); @@ -2606,19 +2685,22 @@ em_allocate_intr(struct adapter *adapter * Try allocating a fast interrupt and the associated deferred * processing contexts. */ - TASK_INIT(&adapter->rxtx_task, 0, em_handle_rxtx, adapter); - TASK_INIT(&adapter->link_task, 0, em_handle_link, adapter); - adapter->tq = taskqueue_create_fast("em_taskq", M_NOWAIT, - taskqueue_thread_enqueue, &adapter->tq); - taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s taskq", - device_get_nameunit(adapter->dev)); + TASK_INIT(&adapter->link_task, INTR_TYPE_NET | INTR_MPSAFE, em_handle_link, adapter); + + adapter->rx_shutdown_flag=FALSE; + adapter->rxKthreadNo=0; + adapter->reorder_cnt=0; + for (int i = 0; i < RX_KTHREADS_NUM; i++) { + adapter->rx_kthreads_handles[i] = NULL; + kthread_create (em_kthread_rx, adapter, adapter->rx_kthreads_handles + i, + INTR_TYPE_NET | INTR_FAST | INTR_MPSAFE, 0, "%s_rx_kthread_%d",device_get_nameunit(dev),i); + } + if ((error = bus_setup_intr(dev, adapter->res_interrupt, - INTR_TYPE_NET, em_intr_fast, NULL, adapter, + INTR_TYPE_NET | INTR_FAST | INTR_MPSAFE, em_intr_fast, NULL, adapter, &adapter->int_handler_tag)) != 0) { device_printf(dev, "Failed to register fast interrupt " "handler: %d\n", error); - taskqueue_free(adapter->tq); - adapter->tq = NULL; return (error); } #endif @@ -2637,11 +2719,12 @@ em_free_intr(struct adapter *adapter) adapter->int_handler_tag); adapter->int_handler_tag = NULL; } - if (adapter->tq != NULL) { - taskqueue_drain(adapter->tq, &adapter->rxtx_task); - taskqueue_drain(taskqueue_fast, &adapter->link_task); - taskqueue_free(adapter->tq); - adapter->tq = NULL; + taskqueue_drain(taskqueue_fast, &adapter->link_task); + + adapter->rx_shutdown_flag=TRUE; + for (int i = 0; i < RX_KTHREADS_NUM; 
i++) { + if (adapter->rx_kthreads_handles[i]) + tsleep(adapter->rx_kthreads_handles[i], 0, "RXSTOP", 3*hz); } } @@ -3138,7 +3221,7 @@ em_initialize_transmit_unit(struct adapt E1000_WRITE_REG(&adapter->hw, E1000_TIDV, adapter->tx_int_delay.value); if(adapter->hw.mac.type >= e1000_82540) E1000_WRITE_REG(&adapter->hw, E1000_TADV, - adapter->tx_abs_int_delay.value); + EM_USECS_TO_TICKS(adapter->tx_abs_int_delay.value)); if ((adapter->hw.mac.type == e1000_82571) || (adapter->hw.mac.type == e1000_82572)) { @@ -3364,6 +3447,10 @@ em_transmit_checksum_setup(struct adapte adapter->num_tx_desc_avail--; adapter->next_avail_tx_desc = curr_txd; + + adapter->tx_counter=0; + adapter->tx_counter_watchdog_mark=0; + adapter->tx_tdh_watchdog_mark=0; } /********************************************************************** @@ -3736,7 +3823,7 @@ em_txeof(struct adapter *adapter) struct e1000_tx_desc *tx_desc, *eop_desc; struct ifnet *ifp = adapter->ifp; - EM_LOCK_ASSERT(adapter); + EM_TXLOCK_ASSERT(adapter); if (adapter->num_tx_desc_avail == adapter->num_tx_desc) return; @@ -3809,15 +3896,8 @@ em_txeof(struct adapter *adapter) * If there are no pending descriptors, clear the timeout. Otherwise, * if some descriptors have been freed, restart the timeout. 
*/ - if (num_avail > EM_TX_CLEANUP_THRESHOLD) { + if (num_avail > EM_TX_CLEANUP_THRESHOLD) ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - /* All clean, turn off the timer */ - if (num_avail == adapter->num_tx_desc) - adapter->watchdog_timer = 0; - /* Some cleaned, reset the timer */ - else if (num_avail != adapter->num_tx_desc_avail) - adapter->watchdog_timer = EM_TX_TIMEOUT; - } adapter->num_tx_desc_avail = num_avail; return; } @@ -4144,7 +4224,7 @@ em_free_receive_structures(struct adapte * *********************************************************************/ static int -em_rxeof(struct adapter *adapter, int count) +em_rxeof(struct adapter *adapter, int count, int myKthreadNo) { struct ifnet *ifp; struct mbuf *mp; @@ -4298,15 +4378,57 @@ discard: if (++i == adapter->num_rx_desc) i = 0; if (m != NULL) { + struct ip *ip = mtod(m, struct ip *); + adapter->next_rx_desc_to_check = i; -#ifdef DEVICE_POLLING - EM_UNLOCK(adapter); - (*ifp->if_input)(ifp, m); - EM_LOCK(adapter); -#else - /* Already running unlocked */ + + /* + * Trick to avoid reordering: + * + * Don't allow the order of TCP packets in the + * same session to change. To make this + * easier, we do not allow packets from the + * same source to be processed by more than one CPU. + */ + int hlen = ip->ip_hl << 2; + if (hlen >= sizeof(struct ip)) { /* minimum header length */ + adapter -> rxIpBeingProcessed[myKthreadNo]=ip->ip_src.s_addr; + + if (ip->ip_src.s_addr) + for (int k=0; k < RX_KTHREADS_NUM; k++) { + if ((adapter->rxIpBeingProcessed[k] == ip->ip_src.s_addr) + && !adapter->waitedBy[k]) { + /* + * A packet from the same IP is being processed + * by another thread; wait until that is done. 
+ */ + adapter->reorder_cnt++; + adapter->waitedBy[k] = myKthreadNo; + msleep(adapter->rxIpBeingProcessed+k, + &adapter->rxmtx, + adapter->rx_kthread_priority, + "RORDER", -1); + } + } + } else + ip = NULL; + + EM_RXUNLOCK(adapter); + (*ifp->if_input)(ifp, m); -#endif + + EM_RXLOCK(adapter); + + adapter->rxIpBeingProcessed[myKthreadNo]=0; + + if (adapter->waitedBy[myKthreadNo]) { + /* + * Wakeup threads blocking on our packet process + * procedure due to the reorder prevention check + */ + wakeup(adapter->rxIpBeingProcessed+myKthreadNo); + adapter->waitedBy[myKthreadNo] = 0; + } i = adapter->next_rx_desc_to_check; } current_desc = &adapter->rx_desc_base[i]; @@ -4438,6 +4560,18 @@ em_disable_intr(struct adapter *adapter) E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff); } +static void +em_enable_intr_rx(struct adapter *adapter) +{ + E1000_WRITE_REG(&adapter->hw, E1000_IMS, E1000_IMS_RXT0 | E1000_IMS_RXDMT0 | E1000_IMS_RXO); +} + +static void +em_disable_intr_rx(struct adapter *adapter) +{ + E1000_WRITE_REG(&adapter->hw, E1000_IMC, E1000_IMS_RXT0 | E1000_IMS_RXDMT0 | E1000_IMS_RXO); +} + /* * Bit of a misnomer, what this really means is * to enable OS management of the system... aka @@ -4878,6 +5012,8 @@ em_print_debug_info(struct adapter *adap adapter->dropped_pkts); device_printf(dev, "Driver tx dma failure in encap = %ld\n", adapter->no_tx_dma_setup); + device_printf(dev, "Packets pended due to reorder = %ld\n", + adapter->reorder_cnt); } static void @@ -4996,7 +5132,8 @@ em_sysctl_int_delay(SYSCTL_HANDLER_ARGS) adapter = info->adapter; - EM_LOCK(adapter); + EM_RXLOCK(adapter); + EM_TXLOCK(adapter); regval = E1000_READ_OFFSET(&adapter->hw, info->offset); regval = (regval & ~0xffff) | (ticks & 0xffff); /* Handle a few special cases. 
*/ @@ -5014,7 +5151,8 @@ em_sysctl_int_delay(SYSCTL_HANDLER_ARGS) break; } E1000_WRITE_OFFSET(&adapter->hw, info->offset, regval); - EM_UNLOCK(adapter); + EM_TXUNLOCK(adapter); + EM_RXUNLOCK(adapter); return (0); } @@ -5034,7 +5172,7 @@ em_add_int_delay_sysctl(struct adapter * #ifndef DEVICE_POLLING static void -em_add_rx_process_limit(struct adapter *adapter, const char *name, +em_add_int_rx_kthread_priority(struct adapter *adapter, const char *name, const char *description, int *limit, int value) { *limit = value; Index: if_em.h =================================================================== RCS file: /home/ncvs/src/sys/dev/em/if_em.h,v retrieving revision 1.62 diff -u -p -r1.62 if_em.h --- if_em.h 10 Sep 2007 21:50:40 -0000 1.62 +++ if_em.h 3 Oct 2007 21:35:44 -0000 @@ -82,7 +82,7 @@ POSSIBILITY OF SUCH DAMAGE. * system is reporting dropped transmits, this value may be set too high * causing the driver to run out of available transmit descriptors. */ -#define EM_TIDV 64 +#define EM_TIDV 65535 /* * EM_TADV - Transmit Absolute Interrupt Delay Value @@ -96,7 +96,7 @@ POSSIBILITY OF SUCH DAMAGE. * along with EM_TIDV, may improve traffic throughput in specific * network conditions. */ -#define EM_TADV 64 +#define EM_TADV 65535 /* * EM_RDTR - Receive Interrupt Delay Timer (Packet Timer) @@ -130,12 +130,12 @@ POSSIBILITY OF SUCH DAMAGE. * along with EM_RDTR, may improve traffic throughput in specific network * conditions. */ -#define EM_RADV 64 +#define EM_RADV 977 /* * This parameter controls the duration of transmit watchdog timer. 
*/ -#define EM_TX_TIMEOUT 5 /* set to 5 seconds */ +#define EM_TX_TIMEOUT 2 /* set to 2 seconds */ /* * This parameter controls when the driver calls the routine to reclaim @@ -270,15 +270,31 @@ struct adapter { struct ifmedia media; struct callout timer; struct callout tx_fifo_timer; - int watchdog_timer; + + unsigned tx_counter; + unsigned tx_counter_watchdog_mark; + unsigned tx_tdh_watchdog_mark; + int io_rid; int msi; int if_flags; - struct mtx mtx; int em_insert_vlan_header; + + /* RX/TX locks */ + struct mtx rxmtx; + struct mtx txmtx; + struct task link_task; - struct task rxtx_task; - struct taskqueue *tq; /* private task queue */ + +#define RX_KTHREADS_NUM 2 + struct proc *rx_kthreads_handles[RX_KTHREADS_NUM]; + int rx_shutdown_flag; + + in_addr_t rxIpBeingProcessed[RX_KTHREADS_NUM]; + int waitedBy[RX_KTHREADS_NUM]; + int rxKthreadNo; + unsigned long reorder_cnt; + /* Management and WOL features */ int wol; int has_manage; @@ -333,7 +349,7 @@ struct adapter { uint32_t next_rx_desc_to_check; uint32_t rx_buffer_len; uint16_t num_rx_desc; - int rx_process_limit; + int rx_kthread_priority; struct em_buffer *rx_buffer_area; bus_dma_tag_t rxtag; bus_dmamap_t rx_sparemap; @@ -413,11 +429,20 @@ typedef struct _DESCRIPTOR_PAIR uint32_t elements; } DESC_ARRAY, *PDESC_ARRAY; -#define EM_LOCK_INIT(_sc, _name) \ - mtx_init(&(_sc)->mtx, _name, MTX_NETWORK_LOCK, MTX_DEF) -#define EM_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->mtx) -#define EM_LOCK(_sc) mtx_lock(&(_sc)->mtx) -#define EM_UNLOCK(_sc) mtx_unlock(&(_sc)->mtx) -#define EM_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->mtx, MA_OWNED) +#define EM_RXLOCK_INIT(_sc, _name) \ + mtx_init(&(_sc)->rxmtx, _name, MTX_NETWORK_LOCK, MTX_DEF) +#define EM_RXLOCK_DESTROY(_sc) mtx_destroy(&(_sc)->rxmtx) +#define EM_RXLOCK(_sc) mtx_lock(&(_sc)->rxmtx) +#define EM_RXTRYLOCK(_sc) mtx_trylock(&(_sc)->rxmtx) +#define EM_RXUNLOCK(_sc) mtx_unlock(&(_sc)->rxmtx) +#define EM_RXLOCK_ASSERT(_sc) mtx_assert(&(_sc)->rxmtx, MA_OWNED) + +#define 
EM_TXLOCK_INIT(_sc, _name) \ + mtx_init(&(_sc)->txmtx, _name, MTX_NETWORK_LOCK, MTX_DEF) +#define EM_TXLOCK_DESTROY(_sc) mtx_destroy(&(_sc)->txmtx) +#define EM_TXLOCK(_sc) mtx_lock(&(_sc)->txmtx) +#define EM_TXTRYLOCK(_sc) mtx_trylock(&(_sc)->txmtx) +#define EM_TXUNLOCK(_sc) mtx_unlock(&(_sc)->txmtx) +#define EM_TXLOCK_ASSERT(_sc) mtx_assert(&(_sc)->txmtx, MA_OWNED) #endif /* _EM_H_DEFINED_ */