[PATCH] A new entry for /proc

3 views
Skip to first unread message

Mauricio Lin

unread,
Jan 6, 2005, 4:18:25 PM1/6/05
to linux-...@vger.kernel.org
Hi all,

Here is a new entry developed for /proc that prints the rss size of each
process memory area (VMA). The maps entry in the original kernel is
able to present the virtual size of each vma, but not the physical
size (rss). This entry can provide additional information for tools
that analyze memory consumption. You can find out the physical memory
size of each library used by a process, and also of the executable file.

Take a look at the output:
# cat /proc/877/smaps
08048000-08132000 r-xp /usr/bin/xmms
Size: 936 kB
Rss: 788 kB
08132000-0813a000 rw-p /usr/bin/xmms
Size: 32 kB
Rss: 32 kB
0813a000-081dd000 rw-p
Size: 652 kB
Rss: 616 kB
b67b5000-b67b6000 ---p
Size: 4 kB
Rss: 0 kB
b67b6000-b6fb5000 rwxp
Size: 8188 kB
Rss: 4 kB
b6fb5000-b6fbc000 r-xp /usr/lib/xmms/Visualization/libsanalyzer.so
Size: 28 kB
Rss: 8 kB
b6fbc000-b6fbd000 rw-p /usr/lib/xmms/Visualization/libsanalyzer.so
Size: 4 kB
Rss: 4 kB
b6fbd000-b6fc1000 r-xp /usr/X11R6/lib/libXxf86vm.so.1.0
Size: 16 kB
Rss: 8 kB
b6fc1000-b6fc2000 rw-p /usr/X11R6/lib/libXxf86vm.so.1.0
Size: 4 kB
Rss: 4 kB
b6fc2000-b702d000 r-xp /usr/X11R6/lib/libGL.so.1.2
Size: 428 kB
Rss: 372 kB
b702d000-b7032000 rw-p /usr/X11R6/lib/libGL.so.1.2
Size: 20 kB
Rss: 20 kB
b7032000-b7033000 rw-p
Size: 4 kB
Rss: 0 kB
...

Here is the patch:

diff -rcNP linux-2.6.10/fs/proc/base.c linux-2.6.10-smaps/fs/proc/base.c
*** linux-2.6.10/fs/proc/base.c Fri Dec 24 17:35:00 2004
--- linux-2.6.10-smaps/fs/proc/base.c Thu Jan 6 15:47:37 2005
***************
*** 11,16 ****
--- 11,18 ----
* go into icache. We cache the reference to task_struct upon lookup too.
* Eventually it should become a filesystem in its own. We don't use the
* rest of procfs anymore.
+ * 2004, 10LE INdT <mauric...@indt.org.br>. A new entry smaps
included in /proc.
+ * It shows the size of rss for each memory area.
*/

#include <asm/uaccess.h>
***************
*** 60,65 ****
--- 62,68 ----
PROC_TGID_MAPS,
PROC_TGID_MOUNTS,
PROC_TGID_WCHAN,
+ PROC_TGID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TGID_SCHEDSTAT,
#endif
***************
*** 86,91 ****
--- 89,95 ----
PROC_TID_MAPS,
PROC_TID_MOUNTS,
PROC_TID_WCHAN,
+ PROC_TID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TID_SCHEDSTAT,
#endif
***************
*** 123,128 ****
--- 127,133 ----
E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TGID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif
***************
*** 148,153 ****
--- 153,159 ----
E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif
***************
*** 497,502 ****
--- 503,527 ----
.release = seq_release,
};

+ extern struct seq_operations proc_pid_smaps_op;
+ static int smaps_open(struct inode *inode, struct file *file)
+ {
+ struct task_struct *task = proc_task(inode);
+ int ret = seq_open(file, &proc_pid_smaps_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = task;
+ }
+ return ret;
+ }
+
+ static struct file_operations proc_smaps_operations = {
+ .open = smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ };
+
extern struct seq_operations mounts_op;
static int mounts_open(struct inode *inode, struct file *file)
{
***************
*** 1341,1346 ****
--- 1366,1376 ----
case PROC_TGID_MOUNTS:
inode->i_fop = &proc_mounts_operations;
break;
+ case PROC_TID_SMAPS:
+ case PROC_TGID_SMAPS:
+ inode->i_fop = &proc_smaps_operations;
+ break;
+
#ifdef CONFIG_SECURITY
case PROC_TID_ATTR:
inode->i_nlink = 2;
diff -rcNP linux-2.6.10/fs/proc/task_mmu.c linux-2.6.10-smaps/fs/proc/task_mmu.c
*** linux-2.6.10/fs/proc/task_mmu.c Fri Dec 24 17:34:01 2004
--- linux-2.6.10-smaps/fs/proc/task_mmu.c Wed Dec 29 10:21:04 2004
***************
*** 81,86 ****
--- 81,159 ----
return 0;
}

+ static void resident_mem_size(struct mm_struct *mm, unsigned long
start_address,
+ unsigned long end_address, unsigned long *size) {
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ unsigned long page;
+
+ for (page = start_address; page < end_address; page += PAGE_SIZE) {
+ pgd = pgd_offset(mm, page);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ continue;
+
+ pmd = pmd_offset(pgd, page);
+
+ if (pmd_none(*pmd))
+ continue;
+
+ if (unlikely(pmd_bad(*pmd)))
+ continue;
+
+ if (pmd_present(*pmd)) {
+ ptep = pte_offset_map(pmd, page);
+ if (!ptep)
+ continue;
+ pte = *ptep;
+ pte_unmap(ptep);
+ if (pte_present(pte)) {
+ *size += PAGE_SIZE;
+ }
+ }
+ }
+ }
+
+ static int show_smap(struct seq_file *m, void *v)
+ {
+ struct vm_area_struct *map = v;
+ struct file *file = map->vm_file;
+ int flags = map->vm_flags;
+ int len;
+ struct mm_struct *mm = map->vm_mm;
+ unsigned long rss = 0;
+ unsigned long vma_len = (map->vm_end - map->vm_start) >> 10;
+
+ if (mm) {
+ resident_mem_size(mm, map->vm_start, map->vm_end, &rss);
+ }
+
+ seq_printf(m, "%08lx-%08lx %c%c%c%c %n",
+ map->vm_start,
+ map->vm_end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+ flags & VM_EXEC ? 'x' : '-',
+ flags & VM_MAYSHARE ? 's' : 'p',
+ &len);
+
+ if (map->vm_file) {
+ len = sizeof(void*) * 6 - len;
+ if (len < 1)
+ len = 1;
+ seq_printf(m, "%*c", len, ' ');
+ seq_path(m, file->f_vfsmnt, file->f_dentry, " \t\n\\");
+ }
+ seq_putc(m, '\n');
+
+ seq_printf(m, "Size:%8lu kB\n"
+ "Rss:%8lu kB\n",
+ vma_len,
+ rss >> 10);
+
+ return 0;
+ }
+
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct task_struct *task = m->private;
***************
*** 134,136 ****
--- 207,216 ----
.stop = m_stop,
.show = show_map
};
+
+ struct seq_operations proc_pid_smaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_smap
+ };

Suggestions are welcome.

BR,

Mauricio Lin.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Andrew Morton

unread,
Jan 6, 2005, 11:25:17 PM1/6/05
to Mauricio Lin, linux-...@vger.kernel.org
Mauricio Lin <mauri...@gmail.com> wrote:
>
> Here is a new entry developed for /proc that prints for each process
> memory area (VMA) the size of rss. The maps from original kernel is
> able to present the virtual size for each vma, but not the physical
> size (rss). This entry can provide an additional information for tools
> that analyze the memory consumption. You can know the physical memory
> size of each library used by a process and also the executable file.
>
> Take a look the output:
> # cat /proc/877/smaps
> 08048000-08132000 r-xp /usr/bin/xmms
> Size: 936 kB
> Rss: 788 kB

This is potentially quite useful. I'd be interested in what others think of
the idea and implementation.

> Here is the patch:

- It was wordwrapped. Mail the patch to yourself first, make sure it
still applies.

- Prepare patches with `diff -u'

-

> + extern struct seq_operations proc_pid_smaps_op;

Put extern headers in .h files, not in .c.


> + static void resident_mem_size(struct mm_struct *mm, unsigned long
> start_address,
> + unsigned long end_address, unsigned long *size) {
> + pgd_t *pgd;
> + pmd_t *pmd;
> + pte_t *ptep, pte;
> + unsigned long page;

The identifier `page' is usually used for pointers to struct page. Please
pick another name?

> + if (pte_present(pte)) {
> + *size += PAGE_SIZE;
> + }

We prefer to omit the braces if they enclose only a single statement.

> + if (map->vm_file) {
> + len = sizeof(void*) * 6 - len;
> + if (len < 1)
> + len = 1;
> + seq_printf(m, "%*c", len, ' ');
> + seq_path(m, file->f_vfsmnt, file->f_dentry, " \t\n\\");
> + }

hm, that's a bit bizarre. Isn't there a printf construct which will do the
right-alignment for you? %8u? (See meminfo_read_proc())

Roger Luethi

unread,
Jan 7, 2005, 7:37:33 AM1/7/05
to Andrew Morton, Mauricio Lin, linux-...@vger.kernel.org
On Thu, 06 Jan 2005 20:23:39 -0800, Andrew Morton wrote:
> Mauricio Lin <mauri...@gmail.com> wrote:
> >
> > Here is a new entry developed for /proc that prints for each process
> > memory area (VMA) the size of rss. The maps from original kernel is
> > able to present the virtual size for each vma, but not the physical
> > size (rss). This entry can provide an additional information for tools
> > that analyze the memory consumption. You can know the physical memory
> > size of each library used by a process and also the executable file.
> >
> > Take a look the output:
> > # cat /proc/877/smaps
> > 08048000-08132000 r-xp /usr/bin/xmms
> > Size: 936 kB
> > Rss: 788 kB
>
> This is potentially quite useful. I'd be interested in what others think of
> the idea and implementation.

With split interfaces (machine-/human-readable) as proposed a few months
ago, we wouldn't need to clutter /proc with such cruft. We could simply
add the obvious field to /proc/maps and add another field to nproc.

Using procfs for both humans and software means over time it will get
worse for _both_ sides, and switching to a saner solution won't get
cheaper, either. I still believe we should bite that bullet now.

Roger

Hugh Dickins

unread,
Jan 8, 2005, 3:23:58 PM1/8/05
to Andrew Morton, Mauricio Lin, William Irwin, linux-...@vger.kernel.org
On Thu, 6 Jan 2005, Andrew Morton wrote:
> Mauricio Lin <mauri...@gmail.com> wrote:
> >
> > Here is a new entry developed for /proc that prints for each process
> > memory area (VMA) the size of rss. The maps from original kernel is
> > able to present the virtual size for each vma, but not the physical
> > size (rss). This entry can provide an additional information for tools
> > that analyze the memory consumption. You can know the physical memory
> > size of each library used by a process and also the executable file.
> >
> > Take a look the output:
> > # cat /proc/877/smaps
> > 08048000-08132000 r-xp /usr/bin/xmms
> > Size: 936 kB
> > Rss: 788 kB
>
> This is potentially quite useful. I'd be interested in what others think of
> the idea and implementation.

Regarding the idea.

Well, it goes back to just what wli freed 2.6 from, and what we scorned
clameter for: a costly examination of every pte-slot of every vma of the
process. That doesn't matter _too_ much so long as there's no standard
tool liable to do it every second or so, nor doing it to every single
process, and it doesn't need spinlock or preemption disabled too long.

But personally I'd be happier for it to remain an out-of-tree patch,
just to discourage people from writing and running such tools,
and to discourage them from adding other such costly analyses.

Potentially quite useful, perhaps. But I don't have a use for it
myself, and if I do have, I'll be content to search out (or recreate)
the patch. Let's hear from those who actually have a use for it now -
the more useful it is, of course, the stronger the argument for inclusion.

I am a bit sceptical how useful such a lot of little numbers would
really be - usually it's an overall picture we're interested in.

Regarding the implementation.

Unnecessarily inefficient: a pte_offset_map and unmap for each pte.
Better go back to the 2.4.28 or 2.5.36 fs/proc/array.c design for
statm_pgd_range + statm_pmd_range + statm_pte_range - but now you
need a pud level too.

Seems to have no locking: needs to down_read mmap_sem to guard vmas.
Does it need page_table_lock? I think not (and proc_pid_statm didn't).

If there were a use for it, that use might want to distinguish between
the "shared rss" of pagecache pages from a file, and the "anon rss" of
private pages copied from file or originally zero - would need to get
the struct page and check PageAnon. And might want to count swap
entries too. Hard to say without real uses in mind.

Andrew mentioned "unsigned long page": similarly, we usually say
"struct vm_area_struct *vma" rather than "*map" (well, some places
say "*mpnt", but that's not a precedent to follow).

Regarding the display.

It's a mixture of two different styles, the /proc/<pid>/maps
many-hex-fields one-vma-per-line style and the /proc/meminfo
one-decimal-kB-per-line style. I think it would be better following
the /proc/<pid>/maps style, but replacing the major,minor,ino fields
by size and rss (anon_rss? swap?) fields (decimal kB? I suppose so).

Hugh

Alan Cox

unread,
Jan 8, 2005, 6:00:29 PM1/8/05
to Hugh Dickins, Andrew Morton, Mauricio Lin, William Irwin, Linux Kernel Mailing List
On Sad, 2005-01-08 at 20:20, Hugh Dickins wrote:
> Seems to have no locking: needs to down_read mmap_sem to guard vmas.
> Does it need page_table_lock? I think not (and proc_pid_statm didn't).

The mmap_sem is insufficient during an exec - something that the
sys_uselib changes noted. It would work in -ac, but that's a property of
how I fixed it rather than how Linus fixed it for the base kernel.

Edjard Souza Mota

unread,
Jan 10, 2005, 4:24:18 AM1/10/05
to Hugh Dickins, Andrew Morton, Mauricio Lin, William Irwin, linux-...@vger.kernel.org
Hi,

Regarding the code:

I am not an expert like you guys, but I worked with the first version of this
proposal about a year ago and posted it to this list. At that time we had no
idea about how useful it might be. Lin continued the work and improved it,
and I can see from the feedback that he will be able to meet the standards.

Regarding the idea:

IMHO, I believe we've just found a potentially important use for it. We tested
the old version of this patch with applications running on the Debian Sarge
distro for the ARM platform (it can be found at ftp.debian.org/dists/sarge).
We tested only on the OMAP 1510 and 5912. The advantage we found is that
these little numbers can be used to help control memory consumption
on embedded devices like the ones we tested. Fine-grained memory control
can help us see how memory is consumed on such systems, which
usually lack swap space. If they ever do have swap (allocating space from
flash memory), this fine-grained control gives us a better picture of where
we should work on optimising memory consumption.

We intend to test this new patch and see how it works with more applications.
Other tests would help comparing the results we will send soon.

br

Edjard


--
"In a world without fences ... who needs Gates?"

Mauricio Lin

unread,
Jan 10, 2005, 9:37:27 AM1/10/05
to Andrew Morton, linux-...@vger.kernel.org
On Thu, 6 Jan 2005 20:23:39 -0800, Andrew Morton <ak...@osdl.org> wrote:
> Mauricio Lin <mauri...@gmail.com> wrote:
> >
> > Here is a new entry developed for /proc that prints for each process
> > memory area (VMA) the size of rss. The maps from original kernel is
> > able to present the virtual size for each vma, but not the physical
> > size (rss). This entry can provide an additional information for tools
> > that analyze the memory consumption. You can know the physical memory
> > size of each library used by a process and also the executable file.
> >
> > Take a look the output:
> > # cat /proc/877/smaps
> > 08048000-08132000 r-xp /usr/bin/xmms
> > Size: 936 kB
> > Rss: 788 kB
>
> This is potentially quite useful. I'd be interested in what others think of
> the idea and implementation.
>
> > Here is the patch:
>
> - It was wordwrapped. Mail the patch to yourself first, make sure it
> still applies.
>
> - Prepare patches with `diff -u'
OK.

>
> -
>
> > + extern struct seq_operations proc_pid_smaps_op;
>
> Put extern headers in .h files, not in .c.
OK.

>
>
> > + static void resident_mem_size(struct mm_struct *mm, unsigned long
> > start_address,
> > + unsigned long end_address, unsigned long *size) {
> > + pgd_t *pgd;
> > + pmd_t *pmd;
> > + pte_t *ptep, pte;
> > + unsigned long page;
>
> The identifier `page' is usually used for pointers to struct page. Please
> pick another name?
OK.

>
> > + if (pte_present(pte)) {
> > + *size += PAGE_SIZE;
> > + }
>
> We prefer to omit the braces if they enclose only a single statement.
>
> > + if (map->vm_file) {
> > + len = sizeof(void*) * 6 - len;
> > + if (len < 1)
> > + len = 1;
> > + seq_printf(m, "%*c", len, ' ');
> > + seq_path(m, file->f_vfsmnt, file->f_dentry, " \t\n\\");
> > + }
>
> hm, that's a bit bizarre. Isn't there a printf construct which will do the
> right-alignment for you? %8u? (See meminfo_read_proc())
OK, we will follow your suggestion. This was an informal PATCH, just
to present the idea of this work to the list. So the mistakes you
pointed out will be fixed and I will send the list a more formal PATCH
with the developers' names attached. OK?

Mauricio Lin

unread,
Jan 10, 2005, 10:25:44 AM1/10/05
to Hugh Dickins, Andrew Morton, William Irwin, linux-...@vger.kernel.org
On Sat, 8 Jan 2005 20:20:39 +0000 (GMT), Hugh Dickins <hu...@veritas.com> wrote:
> On Thu, 6 Jan 2005, Andrew Morton wrote:
> > Mauricio Lin <mauri...@gmail.com> wrote:
> > >
> > > Here is a new entry developed for /proc that prints for each process
> > > memory area (VMA) the size of rss. The maps from original kernel is
> > > able to present the virtual size for each vma, but not the physical
> > > size (rss). This entry can provide an additional information for tools
> > > that analyze the memory consumption. You can know the physical memory
> > > size of each library used by a process and also the executable file.
> > >
> > > Take a look the output:
> > > # cat /proc/877/smaps
> > > 08048000-08132000 r-xp /usr/bin/xmms
> > > Size: 936 kB
> > > Rss: 788 kB
> >
> > This is potentially quite useful. I'd be interested in what others think of
> > the idea and implementation.
>
> Regarding the implementation.
>
> Unnecessarily inefficient: a pte_offset_map and unmap for each pte.
> Better go back to the 2.4.28 or 2.5.36 fs/proc/array.c design for
> statm_pgd_range + statm_pmd_range + statm_pte_range - but now you
> need a pud level too.
>
> Seems to have no locking: needs to down_read mmap_sem to guard vmas.
There are down_read and up_read calls inside the functions. The
proc_pid_smaps_op structure has fields that point to the functions that
take care of the down_read and up_read on mmap_sem.
The smaps implementation is based on the maps entry, so you can check it there.
Actually this implementation could be included in maps, but creating a
new entry is better, because we do not want to mess up the original
maps entry for the time being.

+ struct seq_operations proc_pid_smaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_smap
+ };

> Does it need page_table_lock? I think not (and proc_pid_statm didn't).


>
> If there were a use for it, that use might want to distinguish between
> the "shared rss" of pagecache pages from a file, and the "anon rss" of
> private pages copied from file or originally zero - would need to get
> the struct page and check PageAnon. And might want to count swap
> entries too. Hard to say without real uses in mind.

Let's wait for new results.


>
> Andrew mentioned "unsigned long page": similarly, we usually say
> "struct vm_area_struct *vma" rather than "*map" (well, some places
> say "*mpnt", but that's not a precedent to follow).

OK, this can be changed.


>
> Regarding the display.
>
> It's a mixture of two different styles, the /proc/<pid>/maps
> many-hex-fields one-vma-per-line style and the /proc/meminfo
> one-decimal-kB-per-line style. I think it would be better following
> the /proc/<pid>/maps style, but replacing the major,minor,ino fields
> by size and rss (anon_rss? swap?) fields (decimal kB? I suppose so).

The output format can be changed, because I also prefer the maps
format. The temporary format is just so that people can understand it
easily.

Mauricio Lin

unread,
Jan 14, 2005, 5:50:36 PM1/14/05
to Andrew Morton, linux-...@vger.kernel.org
Hi All,

> > + extern struct seq_operations proc_pid_smaps_op;
>
> Put extern headers in .h files, not in .c.

Andrew, I did not put the extern declaration in a .h file, because I
noticed that there are other extern declarations in base.c, so I think
it would be better to follow the existing code style. If it is necessary
to put it in a .h file, which file should I put it in (or should I create
a new one)?

Here goes the new PATCH.

**************************************
PATCH
**************************************

diff -urN linux-2.6.10/fs/proc/base.c linux-2.6.10-smaps/fs/proc/base.c
--- linux-2.6.10/fs/proc/base.c 2004-12-24 17:35:00.000000000 -0400
+++ linux-2.6.10-smaps/fs/proc/base.c 2005-01-14 17:07:30.000000000 -0400
@@ -11,6 +11,18 @@


* go into icache. We cache the reference to task_struct upon lookup too.
* Eventually it should become a filesystem in its own. We don't use the
* rest of procfs anymore.
+ *

+ *
+ * 2005
+ * Allan Bezerra
+ * Bruna Moreira <bruna....@indt.org.br>
+ * Edjard Mota <edjar...@indt.org.br>
+ * Ilias Biris <ext-ili...@indt.org.br>
+ * Mauricio Lin <mauric...@indt.org.br>
+ *
+ * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *
+ * A new entry smaps included in /proc. It shows the size of rss for


each memory area.
*/

#include <asm/uaccess.h>

@@ -60,6 +72,7 @@


PROC_TGID_MAPS,
PROC_TGID_MOUNTS,
PROC_TGID_WCHAN,
+ PROC_TGID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TGID_SCHEDSTAT,
#endif

@@ -86,6 +99,7 @@


PROC_TID_MAPS,
PROC_TID_MOUNTS,
PROC_TID_WCHAN,
+ PROC_TID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TID_SCHEDSTAT,
#endif

@@ -123,6 +137,7 @@


E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TGID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif

@@ -148,6 +163,7 @@


E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif

@@ -497,6 +513,25 @@
.release = seq_release,
};

+extern struct seq_operations proc_pid_smaps_op;
+static int smaps_open(struct inode *inode, struct file *file)


+{
+ struct task_struct *task = proc_task(inode);
+ int ret = seq_open(file, &proc_pid_smaps_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = task;
+ }
+ return ret;
+}
+

+static struct file_operations proc_smaps_operations = {


+ .open = smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
extern struct seq_operations mounts_op;
static int mounts_open(struct inode *inode, struct file *file)
{

@@ -1341,6 +1376,11 @@


case PROC_TGID_MOUNTS:
inode->i_fop = &proc_mounts_operations;
break;
+ case PROC_TID_SMAPS:
+ case PROC_TGID_SMAPS:
+ inode->i_fop = &proc_smaps_operations;
+ break;
+
#ifdef CONFIG_SECURITY
case PROC_TID_ATTR:
inode->i_nlink = 2;

diff -urN linux-2.6.10/fs/proc/task_mmu.c linux-2.6.10-smaps/fs/proc/task_mmu.c
--- linux-2.6.10/fs/proc/task_mmu.c 2004-12-24 17:34:01.000000000 -0400
+++ linux-2.6.10-smaps/fs/proc/task_mmu.c 2005-01-14 16:35:38.000000000 -0400
@@ -81,6 +81,72 @@
return 0;
}

+static void resident_mem_size(struct mm_struct *mm, unsigned long


start_address,
+ unsigned long end_address, unsigned long *size) {
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *ptep, pte;

+ unsigned long each_page;
+
+ for (each_page = start_address; each_page < end_address; each_page
+= PAGE_SIZE) {
+ pgd = pgd_offset(mm, each_page);


+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ continue;
+

+ pmd = pmd_offset(pgd, each_page);


+
+ if (pmd_none(*pmd))
+ continue;
+
+ if (unlikely(pmd_bad(*pmd)))
+ continue;
+
+ if (pmd_present(*pmd)) {

+ ptep = pte_offset_map(pmd, each_page);


+ if (!ptep)
+ continue;
+ pte = *ptep;
+ pte_unmap(ptep);

+ if (pte_present(pte))
+ *size += PAGE_SIZE;
+ }

+ }
+}
+
+static int show_smap(struct seq_file *m, void *v)


+{
+ struct vm_area_struct *map = v;
+ struct file *file = map->vm_file;
+ int flags = map->vm_flags;

+ struct mm_struct *mm = map->vm_mm;
+ unsigned long rss = 0;
+ unsigned long vma_len = (map->vm_end - map->vm_start) >> 10;
+
+ if (mm) {
+ resident_mem_size(mm, map->vm_start, map->vm_end, &rss);
+ }
+

+ seq_printf(m, "%08lx-%08lx %c%c%c%c ",


+ map->vm_start,
+ map->vm_end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+ flags & VM_EXEC ? 'x' : '-',

+ flags & VM_MAYSHARE ? 's' : 'p');
+
+ if (map->vm_file)


+ seq_path(m, file->f_vfsmnt, file->f_dentry, " \t\n\\");
+

+ seq_putc(m, '\n');
+
+ seq_printf(m, "Size:%8lu kB\n"
+ "Rss:%8lu kB\n",
+ vma_len,
+ rss >> 10);
+
+ return 0;
+}
+
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct task_struct *task = m->private;

@@ -134,3 +200,10 @@


.stop = m_stop,
.show = show_map
};
+

+struct seq_operations proc_pid_smaps_op = {


+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_smap
+};


The smaps entry was used to examine the memory consumption of the Gecko
rendering engine, and it produced interesting, detailed results. Below
is the link to the graphics generated by the application that
reads from /proc/pid/smaps.

http://www.manaos.org/10le/smaps.html

BR,

Mauricio Lin.

Mauricio Lin

unread,
Jan 17, 2005, 1:19:54 PM1/17/05
to Andrew Morton, Mauricio Lin, linux-...@vger.kernel.org
Hi Andrew,

Sorry for the patch errors.

Here goes the fixed patch. I used the xemacs editor for copying it.
The other editors I tried (emacs and pico) do not copy the patch
correctly. Copying the patch also does not work with webmail.

diff -uprN linux-2.6.10/Documentation/filesystems/proc.txt
linux-2.6.10-smaps/Documentation/filesystems/proc.txt
--- linux-2.6.10/Documentation/filesystems/proc.txt 2004-12-24
17:34:29.000000000 -0400
+++ linux-2.6.10-smaps/Documentation/filesystems/proc.txt 2005-01-17
11:29:31.000000000 -0400
@@ -133,6 +133,7 @@ Table 1-1: Process specific entries in /
statm Process memory status information
status Process status in human readable form
wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ smaps Extension of maps, presenting the rss size for each mapped file
..............................................................................

For example, to get the status information of a process, all you have to do is
diff -uprN linux-2.6.10/fs/proc/base.c linux-2.6.10-smaps/fs/proc/base.c


--- linux-2.6.10/fs/proc/base.c 2004-12-24 17:35:00.000000000 -0400

+++ linux-2.6.10-smaps/fs/proc/base.c 2005-01-17 12:11:01.000000000 -0400
@@ -11,6 +11,24 @@


* go into icache. We cache the reference to task_struct upon lookup too.
* Eventually it should become a filesystem in its own. We don't use the
* rest of procfs anymore.
+ *
+ *

+ * Changelog:
+ * 17-Jan-2005


+ * Allan Bezerra
+ * Bruna Moreira <bruna....@indt.org.br>
+ * Edjard Mota <edjar...@indt.org.br>
+ * Ilias Biris <ext-ili...@indt.org.br>
+ * Mauricio Lin <mauric...@indt.org.br>
+ *
+ * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *

+ * A new process specific entry (smaps) included in /proc. It shows the
+ * size of rss for each memory area. The maps entry lacks information
+ * about physical memory size (rss) for each mapped file, i.e.,
+ * rss information for executables and library files.
+ * This additional information is useful for any tools that need to know
+ * about physical memory consumption for a process specific library.
*/

#include <asm/uaccess.h>
@@ -60,6 +78,7 @@ enum pid_directory_inos {


PROC_TGID_MAPS,
PROC_TGID_MOUNTS,
PROC_TGID_WCHAN,
+ PROC_TGID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TGID_SCHEDSTAT,
#endif

@@ -86,6 +105,7 @@ enum pid_directory_inos {


PROC_TID_MAPS,
PROC_TID_MOUNTS,
PROC_TID_WCHAN,
+ PROC_TID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TID_SCHEDSTAT,
#endif

@@ -123,6 +143,7 @@ static struct pid_entry tgid_base_stuff[


E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TGID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif

@@ -148,6 +169,7 @@ static struct pid_entry tid_base_stuff[]


E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif

@@ -497,6 +519,25 @@ static struct file_operations proc_maps_


.release = seq_release,
};

+extern struct seq_operations proc_pid_smaps_op;
+static int smaps_open(struct inode *inode, struct file *file)
+{
+ struct task_struct *task = proc_task(inode);
+ int ret = seq_open(file, &proc_pid_smaps_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = task;
+ }
+ return ret;
+}
+
+static struct file_operations proc_smaps_operations = {
+ .open = smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
extern struct seq_operations mounts_op;
static int mounts_open(struct inode *inode, struct file *file)
{

@@ -1341,6 +1382,11 @@ static struct dentry *proc_pident_lookup


case PROC_TGID_MOUNTS:
inode->i_fop = &proc_mounts_operations;
break;
+ case PROC_TID_SMAPS:
+ case PROC_TGID_SMAPS:
+ inode->i_fop = &proc_smaps_operations;
+ break;
+
#ifdef CONFIG_SECURITY
case PROC_TID_ATTR:
inode->i_nlink = 2;

diff -uprN linux-2.6.10/fs/proc/task_mmu.c linux-2.6.10-smaps/fs/proc/task_mmu.c


--- linux-2.6.10/fs/proc/task_mmu.c 2004-12-24 17:34:01.000000000 -0400

+++ linux-2.6.10-smaps/fs/proc/task_mmu.c 2005-01-17 09:29:38.000000000 -0400
@@ -81,6 +81,75 @@ static int show_map(struct seq_file *m,

return 0;
}

+static void resident_mem_size(struct mm_struct *mm,

+ unsigned long start_address,
+ unsigned long end_address,
+ unsigned long *size)
+{

@@ -134,3 +203,10 @@ struct seq_operations proc_pid_maps_op =


.stop = m_stop,
.show = show_map
};
+
+struct seq_operations proc_pid_smaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_smap
+};

Mauricio Lin

unread,
Jan 17, 2005, 2:05:04 PM1/17/05
to Andrew Morton, Mauricio Lin, linux-...@vger.kernel.org
Hi Andrew,

I figured out the error. This patch works with other editors as well.

+++ linux-2.6.10-smaps/fs/proc/task_mmu.c 2005-01-17 14:55:17.000000000 -0400
@@ -81,6 +81,76 @@ static int show_map(struct seq_file *m,

return 0;
}

+static void resident_mem_size(struct mm_struct *mm,
+ unsigned long start_address,
+ unsigned long end_address,
+ unsigned long *size)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ unsigned long each_page;
+
+ for (each_page = start_address; each_page < end_address;

+ each_page += PAGE_SIZE) {

@@ -134,3 +204,10 @@ struct seq_operations proc_pid_maps_op =

Marcelo Tosatti

unread,
Jan 17, 2005, 3:39:48 PM1/17/05
to Mauricio Lin, Andrew Morton, Mauricio Lin, linux-...@vger.kernel.org, Nick Piggin

Hi Mauricio,

On Mon, Jan 17, 2005 at 03:02:14PM -0400, Mauricio Lin wrote:
> Hi Andrew,
>
> I figured out the error. This patch works for other editors as well.

<snip>

You want to update your patch to handle the new 4level pagetables which introduces
a new indirection table: the PUD.

Check 2.6.11-rc1 - mm/rmap.c.

BTW: What does PUD stand for?

Mauricio Lin

unread,
Jan 17, 2005, 4:30:34 PM1/17/05
to Marcelo Tosatti, Andrew Morton, Mauricio Lin, linux-...@vger.kernel.org, Nick Piggin, Edjar...@indt.org.br
Hi Tosatti,

OK, I will check the new pagetable included in 2.6.11-rc1 and change
the navigation algorithm of page table entries.

Thanks.

BR,

Mauricio Lin.

William Lee Irwin III

unread,
Jan 17, 2005, 4:39:14 PM1/17/05
to Marcelo Tosatti, Mauricio Lin, Andrew Morton, Mauricio Lin, linux-...@vger.kernel.org, Nick Piggin
On Mon, Jan 17, 2005 at 03:30:23PM -0200, Marcelo Tosatti wrote:
> You want to update your patch to handle the new 4level pagetables
> which introduces a new indirection table: the PUD.
> Check 2.6.11-rc1 - mm/rmap.c.
> BTW: What does PUD stand for?

Page Upper Directory. It also is used in a particular euphemism that made
it seem odd to me. I suspect it wasn't thought of when it was chosen.


-- wli

Nick Piggin

unread,
Jan 17, 2005, 8:15:19 PM1/17/05
to William Lee Irwin III, Marcelo Tosatti, Mauricio Lin, Andrew Morton, Mauricio Lin, linux-...@vger.kernel.org

William Lee Irwin III wrote:

>On Mon, Jan 17, 2005 at 03:30:23PM -0200, Marcelo Tosatti wrote:
>
>>You want to update your patch to handle the new 4level pagetables
>>which introduces a new indirection table: the PUD.
>>Check 2.6.11-rc1 - mm/rmap.c.
>>BTW: What does PUD stand for?
>>
>
>Page Upper Directory.
>

That's right.

> It also is used in a particular euphemism that made
>it seem odd to me. I suspect it wasn't thought of when it was chosen.
>
>

No. What's the euphemism?

Nick Piggin

unread,
Jan 19, 2005, 8:01:29 AM1/19/05
to William Lee Irwin III, Marcelo Tosatti, Mauricio Lin, Andrew Morton, Mauricio Lin, linux-...@vger.kernel.org

Nick Piggin wrote:

>
>
> William Lee Irwin III wrote:
>
>
>> It also is used in a particular euphemism that made
>> it seem odd to me. I suspect it wasn't thought of when it was chosen.
>>
>>
>
> No. What's the euphemism?
>
>

... a few private responses later...

Thanks for the enlightenment, everyone. Next time I won't ask! ;)

But hey, what was the alternative? phd? Nah, I wouldn't like to
give Hugh the satisfaction! (http://lwn.net/Articles/111506/)

Mauricio Lin

unread,
Jan 24, 2005, 5:28:06 PM1/24/05
to Marcelo Tosatti, Andrew Morton, Mauricio Lin, linux-...@vger.kernel.org, Nick Piggin
Hi Tosatti and Andrew,

On Mon, 17 Jan 2005 15:30:23 -0200, Marcelo Tosatti
<marcelo...@cyclades.com> wrote:
>
> Hi Mauricio,
>

> You want to update your patch to handle the new 4level pagetables which introduces
> a new indirection table: the PUD.

Here goes the smaps patch updated for kernel 2.6.11-rc2-bk2 with PUD included.


diff -uprN linux-2.6.11-rc2/Documentation/filesystems/proc.txt
linux-2.6.11-rc2-smaps/Documentation/filesystems/proc.txt
--- linux-2.6.11-rc2/Documentation/filesystems/proc.txt 2004-12-24
17:34:29.000000000 -0400
+++ linux-2.6.11-rc2-smaps/Documentation/filesystems/proc.txt 2005-01-24
17:15:03.000000000 -0400


@@ -133,6 +133,7 @@ Table 1-1: Process specific entries in /
statm Process memory status information
status Process status in human readable form
wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ smaps Extension of maps, presenting the rss size for each mapped file
..............................................................................

For example, to get the status information of a process, all you have to do is

diff -uprN linux-2.6.11-rc2/Makefile linux-2.6.11-rc2-smaps/Makefile
--- linux-2.6.11-rc2/Makefile 2005-01-24 17:42:02.000000000 -0400
+++ linux-2.6.11-rc2-smaps/Makefile 2005-01-24 11:57:42.000000000 -0400
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 11
-EXTRAVERSION = -rc2-bk2
+EXTRAVERSION = -rc2-bk2-smaps
NAME=Woozy Numbat

# *DOCUMENTATION*
diff -uprN linux-2.6.11-rc2/fs/proc/base.c linux-2.6.11-rc2-smaps/fs/proc/base.c
--- linux-2.6.11-rc2/fs/proc/base.c 2005-01-24 17:41:51.000000000 -0400
+++ linux-2.6.11-rc2-smaps/fs/proc/base.c 2005-01-24 17:02:37.000000000 -0400


@@ -11,6 +11,24 @@
* go into icache. We cache the reference to task_struct upon lookup too.
* Eventually it should become a filesystem in its own. We don't use the
* rest of procfs anymore.
+ *
+ *
+ * Changelog:

+ * 24-Jan-2005


+ * Allan Bezerra
+ * Bruna Moreira <bruna....@indt.org.br>
+ * Edjard Mota <edjar...@indt.org.br>
+ * Ilias Biris <ext-ili...@indt.org.br>
+ * Mauricio Lin <mauric...@indt.org.br>
+ *
+ * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *
+ * A new process specific entry (smaps) included in /proc. It shows the
+ * size of rss for each memory area. The maps entry lacks information
+ * about physical memory size (rss) for each mapped file, i.e.,
+ * rss information for executables and library files.
+ * This additional information is useful for any tools that need to know
+ * about physical memory consumption for a process specific library.
*/

#include <asm/uaccess.h>

@@ -61,6 +79,7 @@ enum pid_directory_inos {


PROC_TGID_MAPS,
PROC_TGID_MOUNTS,
PROC_TGID_WCHAN,
+ PROC_TGID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TGID_SCHEDSTAT,
#endif

@@ -87,6 +106,7 @@ enum pid_directory_inos {


PROC_TID_MAPS,
PROC_TID_MOUNTS,
PROC_TID_WCHAN,
+ PROC_TID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TID_SCHEDSTAT,
#endif

@@ -124,6 +144,7 @@ static struct pid_entry tgid_base_stuff[


E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TGID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif

@@ -149,6 +170,7 @@ static struct pid_entry tid_base_stuff[]


E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif

@@ -456,6 +478,26 @@ static struct file_operations proc_maps_


.release = seq_release,
};

+extern struct seq_operations proc_pid_smaps_op;
+static int smaps_open(struct inode *inode, struct file *file)
+{
+ struct task_struct *task = proc_task(inode);
+ int ret = seq_open(file, &proc_pid_smaps_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = task;
+ }
+ return ret;
+}
+
+static struct file_operations proc_smaps_operations = {
+ .open = smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
extern struct seq_operations mounts_op;
static int mounts_open(struct inode *inode, struct file *file)
{

@@ -1300,6 +1342,10 @@ static struct dentry *proc_pident_lookup


case PROC_TGID_MOUNTS:
inode->i_fop = &proc_mounts_operations;
break;
+ case PROC_TID_SMAPS:
+ case PROC_TGID_SMAPS:
+ inode->i_fop = &proc_smaps_operations;
+ break;

#ifdef CONFIG_SECURITY
case PROC_TID_ATTR:
inode->i_nlink = 2;

diff -uprN linux-2.6.11-rc2/fs/proc/task_mmu.c
linux-2.6.11-rc2-smaps/fs/proc/task_mmu.c
--- linux-2.6.11-rc2/fs/proc/task_mmu.c 2005-01-24 17:41:51.000000000 -0400
+++ linux-2.6.11-rc2-smaps/fs/proc/task_mmu.c 2005-01-24
12:06:23.000000000 -0400
@@ -113,6 +113,77 @@ static int show_map(struct seq_file *m,

return 0;
}

+static void resident_mem_size(struct mm_struct *mm,
+ unsigned long start_address,
+ unsigned long end_address,
+ unsigned long *size)
+{
+ pgd_t *pgd;

+ pud_t *pud;


+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ unsigned long each_page;
+
+ for (each_page = start_address; each_page < end_address;
+ each_page += PAGE_SIZE) {
+ pgd = pgd_offset(mm, each_page);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ continue;
+

+ pud = pud_offset(pgd, each_page);
+ if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ continue;
+
+ pmd = pmd_offset(pud, each_page);
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))


+ continue;
+
+ if (pmd_present(*pmd)) {
+ ptep = pte_offset_map(pmd, each_page);
+ if (!ptep)
+ continue;
+ pte = *ptep;
+ pte_unmap(ptep);
+ if (pte_present(pte))
+ *size += PAGE_SIZE;
+ }
+ }
+}

@@ -166,3 +237,10 @@ struct seq_operations proc_pid_maps_op =


.stop = m_stop,
.show = show_map
};
+
+struct seq_operations proc_pid_smaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_smap
+};


Let me know about any suggestions.

BR,

Mauricio Lin.

Mauricio Lin

unread,
Feb 22, 2005, 8:16:36 AM2/22/05
to Hugh Dickins, Andrew Morton, William Irwin, linux-...@vger.kernel.org, Richard F. Rebel, Marcelo Tosatti
Hi All,

Here goes the new smaps patch. As suggested by Hugh in another discussion, the
inefficient loop was removed and replaced by the smaps_pgd_range,
smaps_pud_range, smaps_pmd_range and smaps_pte_range functions. I kept
the old resident_mem_size function inside a comment just for anyone
who wants to verify it. BTW, we are using smaps to figure out which
shared libraries have heavy physical memory consumption.


diff -uprN linux-2.6.11-rc4-bk9/Documentation/filesystems/proc.txt
linux-2.6.11-rc4-bk9-smaps/Documentation/filesystems/proc.txt
--- linux-2.6.11-rc4-bk9/Documentation/filesystems/proc.txt 2005-02-20
11:35:13.000000000 -0400
+++ linux-2.6.11-rc4-bk9-smaps/Documentation/filesystems/proc.txt 2005-02-20
11:29:42.000000000 -0400


@@ -133,6 +133,7 @@ Table 1-1: Process specific entries in /
statm Process memory status information
status Process status in human readable form
wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan

+ smaps Extension based on maps, presenting the rss size for each mapped file


..............................................................................

For example, to get the status information of a process, all you have to do is

diff -uprN linux-2.6.11-rc4-bk9/Makefile linux-2.6.11-rc4-bk9-smaps/Makefile
--- linux-2.6.11-rc4-bk9/Makefile 2005-02-20 11:36:00.000000000 -0400
+++ linux-2.6.11-rc4-bk9-smaps/Makefile 2005-02-20 11:31:44.000000000 -0400


@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 11

-EXTRAVERSION = -rc4-bk9
+EXTRAVERSION = -rc4-bk9-smaps
NAME=Woozy Numbat

# *DOCUMENTATION*
diff -uprN linux-2.6.11-rc4-bk9/fs/proc/base.c
linux-2.6.11-rc4-bk9-smaps/fs/proc/base.c
--- linux-2.6.11-rc4-bk9/fs/proc/base.c 2005-02-20 11:35:22.000000000 -0400
+++ linux-2.6.11-rc4-bk9-smaps/fs/proc/base.c 2005-02-20
11:28:00.000000000 -0400
@@ -11,6 +11,28 @@


* go into icache. We cache the reference to task_struct upon lookup too.
* Eventually it should become a filesystem in its own. We don't use the
* rest of procfs anymore.
+ *
+ *
+ * Changelog:

+ * 17-Jan-2005


+ * Allan Bezerra
+ * Bruna Moreira <bruna....@indt.org.br>
+ * Edjard Mota <edjar...@indt.org.br>
+ * Ilias Biris <ext-ili...@indt.org.br>
+ * Mauricio Lin <mauric...@indt.org.br>
+ *
+ * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *
+ * A new process specific entry (smaps) included in /proc. It shows the
+ * size of rss for each memory area. The maps entry lacks information
+ * about physical memory size (rss) for each mapped file, i.e.,
+ * rss information for executables and library files.
+ * This additional information is useful for any tools that need to know
+ * about physical memory consumption for a process specific library.
+ *
+ * Changelog:

+ * 21-Feb-2005


+ * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT

*/

#include <asm/uaccess.h>
@@ -61,6 +83,7 @@ enum pid_directory_inos {


PROC_TGID_MAPS,
PROC_TGID_MOUNTS,
PROC_TGID_WCHAN,
+ PROC_TGID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TGID_SCHEDSTAT,
#endif

@@ -92,6 +115,7 @@ enum pid_directory_inos {


PROC_TID_MAPS,
PROC_TID_MOUNTS,
PROC_TID_WCHAN,
+ PROC_TID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TID_SCHEDSTAT,
#endif

@@ -134,6 +158,7 @@ static struct pid_entry tgid_base_stuff[


E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TGID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif

@@ -164,6 +189,7 @@ static struct pid_entry tid_base_stuff[]


E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif

@@ -488,6 +514,25 @@ static struct file_operations proc_maps_


.release = seq_release,
};

+extern struct seq_operations proc_pid_smaps_op;
+static int smaps_open(struct inode *inode, struct file *file)
+{
+ struct task_struct *task = proc_task(inode);
+ int ret = seq_open(file, &proc_pid_smaps_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = task;
+ }
+ return ret;
+}
+
+static struct file_operations proc_smaps_operations = {
+ .open = smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+

extern struct seq_operations mounts_op;
static int mounts_open(struct inode *inode, struct file *file)
{

@@ -1447,6 +1492,10 @@ static struct dentry *proc_pident_lookup


case PROC_TGID_MOUNTS:
inode->i_fop = &proc_mounts_operations;
break;
+ case PROC_TID_SMAPS:
+ case PROC_TGID_SMAPS:
+ inode->i_fop = &proc_smaps_operations;
+ break;
#ifdef CONFIG_SECURITY
case PROC_TID_ATTR:
inode->i_nlink = 2;

diff -uprN linux-2.6.11-rc4-bk9/fs/proc/task_mmu.c
linux-2.6.11-rc4-bk9-smaps/fs/proc/task_mmu.c
--- linux-2.6.11-rc4-bk9/fs/proc/task_mmu.c 2005-02-20 11:35:22.000000000 -0400
+++ linux-2.6.11-rc4-bk9-smaps/fs/proc/task_mmu.c 2005-02-20
11:21:41.000000000 -0400
@@ -113,6 +113,182 @@ static int show_map(struct seq_file *m,
return 0;
}

+static void smaps_pte_range(pmd_t *pmd,
+ unsigned long address,
+ unsigned long size,
+ unsigned long *rss)
+{
+ pte_t * pte;
+ unsigned long end;
+
+ if (pmd_none(*pmd))
+ return;
+ if (unlikely(pmd_bad(*pmd))) {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+ return;
+ }
+ pte = pte_offset_map(pmd, address);
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+ do {
+ pte_t page = *pte;
+ struct page *ptpage;
+
+ address += PAGE_SIZE;
+ pte++;
+ if (pte_none(page) || (!pte_present(page)))
+ continue;
+ ptpage = pte_page(page);
+ if (!ptpage || PageReserved(ptpage))
+ continue;
+ *rss += PAGE_SIZE;
+
+ } while (address < end);
+}
+
+static void smaps_pmd_range(pud_t *pud,
+ unsigned long address,
+ unsigned long size,
+ unsigned long *rss)
+{
+ pmd_t *pmd;
+ unsigned long end;
+
+ if (pud_none(*pud))
+ return;
+ if (unlikely(pud_bad(*pud))) {
+ pud_ERROR(*pud);
+ pud_clear(pud);
+ return;
+ }
+ pmd = pmd_offset(pud, address);
+ address &= ~PUD_MASK;
+ end = address + size;
+ if (end > PUD_SIZE)
+ end = PUD_SIZE;
+ do {
+ smaps_pte_range(pmd, address, end - address, rss);
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address < end);
+}
+
+static void smaps_pud_range(pgd_t *pgd,
+ unsigned long address,
+ unsigned long size,
+ unsigned long *rss)
+{
+ pud_t *pud;
+ unsigned long end;
+
+ if (pgd_none(*pgd))
+ return;
+ if (unlikely(pgd_bad(*pgd))) {
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+ return;
+ }
+ pud = pud_offset(pgd, address);
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+ do {
+ smaps_pmd_range(pud, address, end - address, rss);
+ address = (address + PUD_SIZE) & PUD_MASK;
+ pud++;
+ } while (address < end);
+}
+
+static void smaps_pgd_range(pgd_t *pgd,


+ unsigned long start_address,
+ unsigned long end_address,

+ unsigned long *rss)
+{
+ while (start_address < end_address) {
+ smaps_pud_range(pgd, start_address, end_address - start_address, rss);
+ start_address = (start_address + PGDIR_SIZE) & PGDIR_MASK;
+ pgd++;
+ }
+}
+
+/*

+*/


+
+static int show_smap(struct seq_file *m, void *v)
+{
+ struct vm_area_struct *map = v;
+ struct file *file = map->vm_file;
+ int flags = map->vm_flags;
+ struct mm_struct *mm = map->vm_mm;
+ unsigned long rss = 0;
+ unsigned long vma_len = (map->vm_end - map->vm_start) >> 10;
+
+ if (mm) {

+ pgd_t *pgd = pgd_offset(mm, map->vm_start);
+ smaps_pgd_range(pgd, map->vm_start, map->vm_end, &rss);

@@ -166,3 +342,10 @@ struct seq_operations proc_pid_maps_op =


.stop = m_stop,
.show = show_map
};
+
+struct seq_operations proc_pid_smaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_smap
+};

BR,

Mauricio Lin.

On Sat, 8 Jan 2005 20:20:39 +0000 (GMT), Hugh Dickins <hu...@veritas.com> wrote:

Mauricio Lin

unread,
Feb 24, 2005, 3:34:05 AM2/24/05
to Hugh Dickins, Andrew Morton, William Irwin, linux-...@vger.kernel.org, Richard F. Rebel, Marcelo Tosatti
Hi Hugh,

You said that the old smaps version is not efficient because of the way
it accesses each pte. So I changed it to use pgd_range, pud_range,
pmd_range and pte_range. Now I am trying to compare the efficiency
of the old and new smaps, but something is wrong.

I put some timers before and after the function that executes the
traversing algorithm in order to measure the elapsed time.
Both versions (old and new smaps) show 0 jiffies as elapsed time.

Is anything wrong? Any ideas?

BR,

Mauricio Lin.

Andrew Morton

unread,
Feb 24, 2005, 4:12:12 AM2/24/05
to Mauricio Lin, hu...@veritas.com, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, Nick Piggin
Mauricio Lin <mauri...@gmail.com> wrote:
>
> You said that the old smaps version is not efficient because of the way
> it accesses each pte.

Nick is talking about changing the kernel so that it "refcounts pagetable
pages". I'm not sure why.

I assume that this means that each pte page's refcount will be incremented
by one for each instantiated pte. If so, then /proc/pid/smaps can become a
lot more efficient. Just add up the page refcounts on all the pte pages -
no need to walk the ptes themselves.

Maybe?

Mauricio Lin

unread,
Feb 24, 2005, 6:44:36 AM2/24/05
to Andrew Morton, hu...@veritas.com, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, Nick Piggin
Hi Andrew,

But can i use jiffies to measure this kind of performance??? AFAIK, if
it is more efficient, then it is faster, right? How can I know how
fast it is? Any idea?

BR,

Mauricio Lin.

Andrew Morton

unread,
Feb 24, 2005, 6:54:21 AM2/24/05
to Mauricio Lin, hu...@veritas.com, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
Mauricio Lin <mauri...@gmail.com> wrote:
>
> But can i use jiffies to measure this kind of performance??? AFAIK, if
> it is more efficient, then it is faster, right? How can I know how
> fast it is? Any idea?

umm,

time ( for i in $(seq 100); do; cat /proc/nnn/smaps; done > /dev/null )

Albert Cahalan

unread,
Feb 24, 2005, 2:25:37 PM2/24/05
to linux-kernel mailing list, Andrew Morton OSDL
[quoting various people...]

> Here is a new entry developed for /proc that prints for each process
> memory area (VMA) the size of rss. The maps from original kernel is
> able to present the virtual size for each vma, but not the physical
> size (rss). This entry can provide an additional information for tools
> that analyze the memory consumption. You can know the physical memory
> size of each library used by a process and also the executable file.
>
> Take a look the output:
> # cat /proc/877/smaps
> 08048000-08132000 r-xp /usr/bin/xmms
> Size: 936 kB
> Rss: 788 kB

> 08132000-0813a000 rw-p /usr/bin/xmms
> Size: 32 kB
> Rss: 32 kB
> 0813a000-081dd000 rw-p
> Size: 652 kB
> Rss: 616 kB

The most important thing about a /proc file format is that it has
a documented means of being extended in the future. Without such
documentation, it is impossible to write a reliable parser.

The "Name: value" stuff is rather slow. Right now procps (ps, top, etc.)
is using a perfect hash function to parse the /proc/*/status files.
("man gperf") This is just plain gross, but needed for decent performance.

Extending the /proc/*/maps file might be possible. It is commonly used
by debuggers I think, so you'd better at least verify that gdb is OK.
The procps "pmap" tool uses it too. To satisfy the procps parser:

a. no more than 31 flags
b. no '/' prior to the filename
c. nothing after the filename
d. no new fields inserted prior to the inode number

> If there were a use for it, that use might want to distinguish between
> the "shared rss" of pagecache pages from a file, and the "anon rss" of
> private pages copied from file or originally zero - would need to get
> the struct page and check PageAnon. And might want to count swap
> entries too. Hard to say without real uses in mind.

...


> It's a mixture of two different styles, the /proc/<pid>/maps
> many-hex-fields one-vma-per-line style and the /proc/meminfo
> one-decimal-kB-per-line style. I think it would be better following
> the /proc/<pid>/maps style, but replacing the major,minor,ino fields
> by size and rss (anon_rss? swap?) fields (decimal kB? I suppose so).

The more info the better. See the pmap "-x" option, currently missing
some data that the kernel does not supply. There are numerous other
pmap options that are completely unimplemented because of the lack of
info. See the Solaris 10 man page for pmap, available on Sun's web site.

Mauricio Lin

unread,
Feb 25, 2005, 10:18:15 AM2/25/05
to Andrew Morton, hu...@veritas.com, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
Hi all,

I tested the two smaps entries using the time command.

I tested 100.000 cat commands with smaps for each version.

I checked the difference between the two versions, and the new one is
faster than the old one. So Hugh is correct about the loop performance.

Thanks!!!

Mauricio Lin.

Mauricio Lin

unread,
Feb 28, 2005, 4:45:18 AM2/28/05
to Andrew Morton, hu...@veritas.com, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
Hi all,

I made a mistake. In fact, the old smaps is still faster than the new one.

Take a look:

Old smaps
real 19.52
user 2.15
sys 17.27

New smaps
real 25.93
user 3.19
sys 22.31

Any comments????

BR,

Mauricio Lin.

Mauricio Lin

unread,
Feb 28, 2005, 4:58:39 AM2/28/05
to Andrew Morton, hu...@veritas.com, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
Hi,

Just some explanation about the mistake.

I have put cat /proc/pid/status instead of /proc/pid/smaps.

So I was testing the /proc/pid/status and not the /proc/pid/smaps.

Now I am testing with /proc/pid/smaps and the values are showing that
the old one is faster than the new one. So I will keep using the old
smaps version.

Any suggestion???

BR,

Mauricio Lin.

Hugh Dickins

unread,
Feb 28, 2005, 3:44:32 PM2/28/05
to Mauricio Lin, Andrew Morton, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
On Mon, 28 Feb 2005, Mauricio Lin wrote:
>
> Now I am testing with /proc/pid/smaps and the values are showing that
> the old one is faster than the new one. So I will keep using the old
> smaps version.

Sorry, I don't have time for more than the briefest look.

It appears that your old resident_mem_size method is just checking
pte_present, whereas your new smaps_pte_range method is also doing
pte_page (yet no prior check for pfn_valid: wrong) and checking
!PageReserved i.e. accessing the struct page corresponding to each
pte. So it's not a fair comparison, your new method is accessing
many more cachelines than your old method.

Though it's correct to check pfn_valid and !PageReserved to get the
same total rss as would be reported elsewhere, I'd suggest that it's
really not worth the overhead of those struct page accesses: just
stick with the pte_present test.

Your smaps_pte_range is missing pte_unmap?

Hugh

Mauricio Lin

unread,
Mar 1, 2005, 3:11:15 AM3/1/05
to Hugh Dickins, Andrew Morton, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
On Mon, 28 Feb 2005 20:41:31 +0000 (GMT), Hugh Dickins <hu...@veritas.com> wrote:
> On Mon, 28 Feb 2005, Mauricio Lin wrote:
> >
> > Now I am testing with /proc/pid/smaps and the values are showing that
> > the old one is faster than the new one. So I will keep using the old
> > smaps version.
>
> Sorry, I don't have time for more than the briefest look.
>
> It appears that your old resident_mem_size method is just checking
> pte_present, whereas your new smaps_pte_range method is also doing
> pte_page (yet no prior check for pfn_valid: wrong) and checking
> !PageReserved i.e. accessing the struct page corresponding to each
> pte. So it's not a fair comparison, your new method is accessing
> many more cachelines than your old method.
>
> Though it's correct to check pfn_valid and !PageReserved to get the
> same total rss as would be reported elsewhere, I'd suggest that it's
> really not worth the overhead of those struct page accesses: just
> stick with the pte_present test.
So, I can remove the PageReserved macro without any problems, right?


>
> Your smaps_pte_range is missing pte_unmap?

Yes, but I have already fixed this problem. Paul Mundt pointed out the
missing unmap.

Thanks,

Let me perform new experiments now.

BR,

Mauricio Lin.

Mauricio Lin

unread,
Mar 1, 2005, 9:20:37 AM3/1/05
to Hugh Dickins, Andrew Morton, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
Well,

It is working better now. You are right Hugh. Now the new version is
faster than the old one. I removed the struct page and its related
function.

Thanks,

BR,

Mauricio Lin.

Mauricio Lin

unread,
Mar 1, 2005, 9:35:43 AM3/1/05
to Albert Cahalan, linux-kernel mailing list, Andrew Morton OSDL
Hi,


> The most important thing about a /proc file format is that it has
> a documented means of being extended in the future. Without such
> documentation, it is impossible to write a reliable parser.
>
> The "Name: value" stuff is rather slow. Right now procps (ps, top, etc.)
> is using a perfect hash function to parse the /proc/*/status files.
> ("man gperf") This is just plain gross, but needed for decent performance.

So, changing the output format is important, right?



> Extending the /proc/*/maps file might be possible. It is commonly used
> by debuggers I think, so you'd better at least verify that gdb is OK.
> The procps "pmap" tool uses it too. To satisfy the procps parser:
>
> a. no more than 31 flags
> b. no '/' prior to the filename
> c. nothing after the filename
> d. no new fields inserted prior to the inode number
>

Yes, smaps is probably more suitable for a tracking environment. Do you
know of any public kernel tree (I mean a kernel version for tracking and
debugging) where I can post the smaps patch for inclusion?

BR,

Mauricio Lin.

Mauricio Lin

unread,
Mar 1, 2005, 10:49:40 AM3/1/05
to Hugh Dickins, Andrew Morton, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
Hi,

Here are some values from the experiments. The values are the elapsed
real time used by the process, in seconds. Each row corresponds to
10000 cat /proc/pid/smaps commands.

Old smaps
19.41
19.31
21.38
20.16

New smaps
16.82
16.75
16.75
16.79


BR,

Mauricio Lin.

Mauricio Lin

unread,
Mar 2, 2005, 7:22:15 AM3/2/05
to Hugh Dickins, Andrew Morton, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
Does anyone know if the place I put pte_unmap is logical and safe
after several pte increments?

pte = pte_offset_map(pmd, address);
address &= ~PMD_MASK;


end = address + size;

if (end > PMD_SIZE)
end = PMD_SIZE;
do {
pte_t page = *pte;

address += PAGE_SIZE;
pte++;
if (pte_none(page) || (!pte_present(page)))
continue;
*rss += PAGE_SIZE;
} while (address < end);
pte_unmap(pte);

Hugh Dickins

unread,
Mar 2, 2005, 2:10:06 PM3/2/05
to Mauricio Lin, Andrew Morton, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
On Wed, 2 Mar 2005, Mauricio Lin wrote:
> Does anyone know if the place I put pte_unmap is logical and safe
> after several pte increments?

The place is logical and safe, but it's still not quite right.
You should have found several examples of loops having the same
problem, and what do they do? ....

> pte = pte_offset_map(pmd, address);
> address &= ~PMD_MASK;
> end = address + size;
> if (end > PMD_SIZE)
> end = PMD_SIZE;
> do {
> pte_t page = *pte;
>
> address += PAGE_SIZE;
> pte++;
> if (pte_none(page) || (!pte_present(page)))
> continue;
> *rss += PAGE_SIZE;
> } while (address < end);
> pte_unmap(pte);

pte_unmap(pte - 1);

which works because it's a do {} while () loop which has certainly
incremented pte at least once. But some people probably loathe that
style, and would prefer to save orig_pte then pte_unmap(orig_pte).

Hugh

Mauricio Lin

unread,
Mar 3, 2005, 2:28:00 AM3/3/05
to Hugh Dickins, Andrew Morton, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
Hi Hugh,

How about mapping and unmapping each pte?

I mean remove the pte++ and use pte_offset_map for each incremented
address and then pte_unmap. So each incremented address is an index to
get the next pte via pte_offset_map.

BR,

Mauricio Lin.

Hugh Dickins

unread,
Mar 3, 2005, 7:54:23 AM3/3/05
to Mauricio Lin, Andrew Morton, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
On Thu, 3 Mar 2005, Mauricio Lin wrote:
>
> How about mapping and unmapping each pte?
>
> I mean remove the pte++ and use pte_offset_map for each incremented
> address and then pte_unmap. So each incremented address is an index to
> get the next pte via pte_offset_map.

We're going round in circles.

No. Why would you want to do it that way? Much less efficient.
Mapping and unmapping is expensive - why else would a processor
have a TLB, but to batch these operations?

But probably you're testing without CONFIG_HIGHPTE, with less than
2GB of memory, in which case you'd be unlikely to notice any effect.
(There's highmem in 1GB, but perhaps not enough to rely on the page
tables coming from highmem often enough to show the slowdown.)

When working in an unfamiliar area, follow the example of existing
code which has to do the same kind of thing - various examples in
mm/memory.c and other mm files. (But don't be surprised if they
change in a little while: rewrite coming to get them all in synch.)

The only reason to contemplate mapping one pte at a time is latency:
the per-cpu mapping which pte_offset_map uses means that preemption
must be disabled until it's unmapped, which may cause bad latency.

zap_pte_range is common and has a lot of work to do, involving the
struct page for each pte, so unmap_vmas divides the work into
ZAP_BLOCK_SIZE pieces. Whereas mprotect's change_pte_range is
much more like your case, and just does a whole page table at once.

If you look at those examples, you'll notice spin_lock is held on
page_table_lock, which seems to be missing from your code. You
should add it in: since you're only reading, and your counts are
not critical, on most architectures you'll be safe without the
page_table_lock; but on a few, I suspect it _might_ be possible
to crash the kernel in rare transitory cases without it: be safe.

Mauricio Lin

unread,
Mar 3, 2005, 9:28:56 AM3/3/05
to Hugh Dickins, Andrew Morton, w...@holomorphy.com, linux-...@vger.kernel.org, rre...@whenu.com, marcelo...@cyclades.com, nickp...@yahoo.com.au
Hi all,

I am sending some modifications to the smaps patch.

BTW, thanks, Hugh, for all your suggestions. The page_table_lock was
already included in the smaps patch.

BR,

Mauricio Lin.


diff -uprN linux-2.6.11-rc4-bk9/Documentation/filesystems/proc.txt
linux-2.6.11-rc4-bk9-smaps/Documentation/filesystems/proc.txt
--- linux-2.6.11-rc4-bk9/Documentation/filesystems/proc.txt 2005-02-28
06:24:09.000000000 -0400
+++ linux-2.6.11-rc4-bk9-smaps/Documentation/filesystems/proc.txt 2005-02-28
06:28:10.000000000 -0400


@@ -133,6 +133,7 @@ Table 1-1: Process specific entries in /
statm Process memory status information
status Process status in human readable form
wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ smaps Extension based on maps, presenting the rss size for each mapped file
..............................................................................

For example, to get the status information of a process, all you have to do is
diff -uprN linux-2.6.11-rc4-bk9/Makefile linux-2.6.11-rc4-bk9-smaps/Makefile

--- linux-2.6.11-rc4-bk9/Makefile 2005-02-28 06:24:59.000000000 -0400
+++ linux-2.6.11-rc4-bk9-smaps/Makefile 2005-02-28 06:28:10.000000000 -0400


@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 11
-EXTRAVERSION = -rc4-bk9
+EXTRAVERSION = -rc4-bk9-smaps
NAME=Woozy Numbat

# *DOCUMENTATION*
diff -uprN linux-2.6.11-rc4-bk9/fs/proc/base.c
linux-2.6.11-rc4-bk9-smaps/fs/proc/base.c

--- linux-2.6.11-rc4-bk9/fs/proc/base.c 2005-02-28 06:24:41.000000000 -0400
+++ linux-2.6.11-rc4-bk9-smaps/fs/proc/base.c 2005-02-28
06:28:10.000000000 -0400

--- linux-2.6.11-rc4-bk9/fs/proc/task_mmu.c 2005-02-28 06:24:41.000000000 -0400
+++ linux-2.6.11-rc4-bk9-smaps/fs/proc/task_mmu.c 2005-02-28
06:32:33.000000000 -0400
@@ -113,6 +113,142 @@ static int show_map(struct seq_file *m,

return 0;
}

+static void smaps_pte_range(pmd_t *pmd,
+ unsigned long address,
+ unsigned long size,
+ unsigned long *rss)
+{
+ pte_t * pte;
+ unsigned long end;
+
+ if (pmd_none(*pmd))
+ return;
+ if (unlikely(pmd_bad(*pmd))) {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+ return;
+ }
+ pte = pte_offset_map(pmd, address);
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+ do {
+ pte_t page = *pte;
+

+ address += PAGE_SIZE;
+ pte++;
+ if (pte_none(page) || (!pte_present(page)))
+ continue;

+ *rss += PAGE_SIZE;
+
+ } while (address < end);

+ pte_unmap(pte);

+ do {


+ smaps_pud_range(pgd, start_address, end_address - start_address, rss);
+ start_address = (start_address + PGDIR_SIZE) & PGDIR_MASK;
+ pgd++;

+ } while (start_address < end_address);
+}


+
+static int show_smap(struct seq_file *m, void *v)
+{
+ struct vm_area_struct *map = v;
+ struct file *file = map->vm_file;
+ int flags = map->vm_flags;
+ struct mm_struct *mm = map->vm_mm;
+ unsigned long rss = 0;
+ unsigned long vma_len = (map->vm_end - map->vm_start) >> 10;
+
+ if (mm) {

+ spin_lock(&mm->page_table_lock);


+ pgd_t *pgd = pgd_offset(mm, map->vm_start);
+ smaps_pgd_range(pgd, map->vm_start, map->vm_end, &rss);

+ spin_unlock(&mm->page_table_lock);

@@ -166,3 +302,10 @@ struct seq_operations proc_pid_maps_op =


.stop = m_stop,
.show = show_map
};
+
+struct seq_operations proc_pid_smaps_op = { /* seq_file callbacks for the smaps entry */
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop, /* same stop callback that proc_pid_maps_op uses */
+ .show = show_smap /* smaps-specific show callback */
+};

Mauricio Lin

unread,
Apr 29, 2005, 2:40:48 PM4/29/05
to Andrew Morton, linux-...@vger.kernel.org
Hi Andrew,

I sent the smaps patch some months ago. Here is a new version with
more features. Anyone who wants to analyse memory consumption can use
it, especially to figure out which libraries could be trimmed for
embedded systems. The new features report, for each mapping, the
physical size of its shared clean, shared dirty, private clean and
private dirty pages. Do you think this is useful for the Linux
community?

Take a look at the example below:

# cat /proc/4576/smaps

08048000-080dc000 r-xp /bin/bash
Size: 592 KB
Rss: 500 KB
Shared_Clean: 500 KB
Shared_Dirty: 0 KB
Private_Clean: 0 KB
Private_Dirty: 0 KB
080dc000-080e2000 rw-p /bin/bash
Size: 24 KB
Rss: 24 KB
Shared_Clean: 0 KB
Shared_Dirty: 0 KB
Private_Clean: 0 KB
Private_Dirty: 24 KB
080e2000-08116000 rw-p
Size: 208 KB
Rss: 208 KB
Shared_Clean: 0 KB
Shared_Dirty: 0 KB
Private_Clean: 0 KB
Private_Dirty: 208 KB
b7e2b000-b7e34000 r-xp /lib/tls/libnss_files-2.3.2.so
Size: 36 KB
Rss: 12 KB
Shared_Clean: 12 KB
Shared_Dirty: 0 KB
Private_Clean: 0 KB
Private_Dirty: 0 KB
..

Here goes the patch:

diff -uprN linux-2.6.11.7/Documentation/filesystems/proc.txt
linux-2.6.11.7-smaps/Documentation/filesystems/proc.txt
--- linux-2.6.11.7/Documentation/filesystems/proc.txt 2005-04-07
14:57:27.000000000 -0400
+++ linux-2.6.11.7-smaps/Documentation/filesystems/proc.txt 2005-04-29
11:10:16.000000000 -0400


@@ -133,6 +133,7 @@ Table 1-1: Process specific entries in /
statm Process memory status information
status Process status in human readable form
wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ smaps Extension based on maps, presenting the rss size for each mapped file
..............................................................................

For example, to get the status information of a process, all you have to do is

diff -uprN linux-2.6.11.7/fs/proc/base.c linux-2.6.11.7-smaps/fs/proc/base.c
--- linux-2.6.11.7/fs/proc/base.c 2005-04-07 14:57:45.000000000 -0400
+++ linux-2.6.11.7-smaps/fs/proc/base.c 2005-04-29 11:10:16.000000000 -0400
@@ -11,6 +11,40 @@


* go into icache. We cache the reference to task_struct upon lookup too.
* Eventually it should become a filesystem in its own. We don't use the
* rest of procfs anymore.
+ *
+ *
+ * Changelog:
+ * 17-Jan-2005
+ * Allan Bezerra
+ * Bruna Moreira <bruna....@indt.org.br>
+ * Edjard Mota <edjar...@indt.org.br>

+ * Ilias Biris <ilias...@indt.org.br>


+ * Mauricio Lin <mauric...@indt.org.br>
+ *
+ * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *
+ * A new process specific entry (smaps) included in /proc. It shows the
+ * size of rss for each memory area. The maps entry lacks information
+ * about physical memory size (rss) for each mapped file, i.e.,
+ * rss information for executables and library files.
+ * This additional information is useful for any tools that need to know
+ * about physical memory consumption for a process specific library.
+ *
+ * Changelog:
+ * 21-Feb-2005
+ * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT

+ * Pud inclusion in the page table walking.
+ *
+ * ChangeLog:
+ * 10-Mar-2005
+ * 10LE Instituto Nokia de Tecnologia - INdT:
+ * A better way to walk through the page table, as suggested by Hugh Dickins.
+ *
+ * Simo Piiroinen <simo.pi...@nokia.com>:
+ * Smaps information related to shared, private, clean and dirty pages.
+ *
+ * Paul Mundt <paul....@nokia.com>:
+ * Overall revision about smaps.
*/

#include <asm/uaccess.h>
@@ -61,6 +95,7 @@ enum pid_directory_inos {


PROC_TGID_MAPS,
PROC_TGID_MOUNTS,
PROC_TGID_WCHAN,
+ PROC_TGID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TGID_SCHEDSTAT,
#endif

@@ -92,6 +127,7 @@ enum pid_directory_inos {


PROC_TID_MAPS,
PROC_TID_MOUNTS,
PROC_TID_WCHAN,
+ PROC_TID_SMAPS,
#ifdef CONFIG_SCHEDSTATS
PROC_TID_SCHEDSTAT,
#endif

@@ -134,6 +170,7 @@ static struct pid_entry tgid_base_stuff[


E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TGID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif

@@ -164,6 +201,7 @@ static struct pid_entry tid_base_stuff[]


E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO),
E(PROC_TID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ E(PROC_TID_SMAPS, "smaps", S_IFREG|S_IRUGO),
#ifdef CONFIG_SECURITY
E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO),
#endif

@@ -488,6 +526,25 @@ static struct file_operations proc_maps_


.release = seq_release,
};

+extern struct seq_operations proc_pid_smaps_op;
+static int smaps_open(struct inode *inode, struct file *file) /* open handler for the smaps proc file */
+{
+ struct task_struct *task = proc_task(inode); /* task that this proc inode refers to */
+ int ret = seq_open(file, &proc_pid_smaps_op);
+ if (!ret) { /* seq_open returned 0 */
+ struct seq_file *m = file->private_data;
+ m->private = task; /* m_start() reads the task back from m->private */
+ }
+ return ret; /* seq_open()'s return value, passed through unchanged */
+}
+
+static struct file_operations proc_smaps_operations = { /* installed as i_fop for PROC_TID_SMAPS and PROC_TGID_SMAPS inodes */
+ .open = smaps_open, /* binds the seq_file to proc_pid_smaps_op and the task */
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
extern struct seq_operations mounts_op;
static int mounts_open(struct inode *inode, struct file *file)
{

@@ -1447,6 +1504,10 @@ static struct dentry *proc_pident_lookup


case PROC_TGID_MOUNTS:
inode->i_fop = &proc_mounts_operations;
break;
+ case PROC_TID_SMAPS:
+ case PROC_TGID_SMAPS:
+ inode->i_fop = &proc_smaps_operations;
+ break;
#ifdef CONFIG_SECURITY
case PROC_TID_ATTR:
inode->i_nlink = 2;

diff -uprN linux-2.6.11.7/fs/proc/task_mmu.c
linux-2.6.11.7-smaps/fs/proc/task_mmu.c
--- linux-2.6.11.7/fs/proc/task_mmu.c 2005-04-07 14:57:16.000000000 -0400
+++ linux-2.6.11.7-smaps/fs/proc/task_mmu.c 2005-04-29 11:10:16.000000000 -0400
@@ -113,6 +113,178 @@ static int show_map(struct seq_file *m,
return 0;
}

+struct mem_size_stats /* per-VMA totals filled in by the smaps page-table walk; every field is in bytes */
+{
+ unsigned long resident; /* PAGE_SIZE for every present pte */
+ unsigned long shared_clean; /* present, page_count(page) >= 2, pte not dirty */
+ unsigned long shared_dirty; /* present, page_count(page) >= 2, pte dirty */
+ unsigned long private_clean; /* present, page_count(page) < 2, pte not dirty */
+ unsigned long private_dirty; /* present, page_count(page) < 2, pte dirty */
+};
+


+static void smaps_pte_range(pmd_t *pmd,	/* pmd entry whose pte page is scanned */
+				unsigned long address,	/* first address to examine */
+				unsigned long size,	/* bytes to examine; clamped to this pmd below */
+				struct mem_size_stats *mss)	/* byte counters, updated in place */
+{
+	pte_t *ptep, pte;	/* ptep: cursor into the mapped pte page; pte: copy of the current entry */
+	unsigned long end;
+	unsigned long pfn;
+	struct page *page;
+
+	if (pmd_none(*pmd))
+		return;
+	if (unlikely(pmd_bad(*pmd))) {
+		pmd_ERROR(*pmd);
+		pmd_clear(pmd);	/* NOTE(review): clears the bad entry although the walk otherwise only reads */
+		return;
+	}
+	ptep = pte_offset_map(pmd, address);
+	address &= ~PMD_MASK;	/* keep only the offset inside this pmd */
+	end = address + size;
+	if (end > PMD_SIZE)
+		end = PMD_SIZE;
+	do {
+		pte = *ptep;
+		address += PAGE_SIZE;
+		ptep++;	/* advance before the checks so that "continue" cannot skip it */
+
+		if (pte_none(pte) || (!pte_present(pte)))
+			continue;
+
+		mss->resident += PAGE_SIZE;
+		pfn = pte_pfn(pte);
+		if (pfn_valid(pfn)) {	/* pages without a valid pfn count as resident only */
+			page = pfn_to_page(pfn);
+			if (page_count(page) >= 2) {	/* reported as shared */
+				if (pte_dirty(pte))
+					mss->shared_dirty += PAGE_SIZE;
+				else
+					mss->shared_clean += PAGE_SIZE;
+			}
+			else {	/* reported as private */
+				if (pte_dirty(pte))
+					mss->private_dirty += PAGE_SIZE;
+				else
+					mss->private_clean += PAGE_SIZE;
+			}
+		}
+	} while (address < end);
+	pte_unmap(ptep - 1);	/* unmap via the mapped pointer, not the pte_t copy; ptep - 1 is the last entry read */
+}
+
+static void smaps_pmd_range(pud_t *pud, /* walks the pmd entries below one pud entry */
+ unsigned long address,
+ unsigned long size,

+ struct mem_size_stats *mss)


+{
+ pmd_t *pmd;
+ unsigned long end;
+
+ if (pud_none(*pud))
+ return;
+ if (unlikely(pud_bad(*pud))) {
+ pud_ERROR(*pud);
+ pud_clear(pud); /* NOTE(review): clears the bad entry although the walk otherwise only reads */
+ return;
+ }
+ pmd = pmd_offset(pud, address);
+ address &= ~PUD_MASK; /* keep only the offset inside this pud */
+ end = address + size;
+ if (end > PUD_SIZE) /* never walk past the end of this pud */
+ end = PUD_SIZE;
+ do {

+ smaps_pte_range(pmd, address, end - address, mss); /* callee clamps the size to one pmd */


+ address = (address + PMD_SIZE) & PMD_MASK; /* start of the next pmd */
+ pmd++;
+ } while (address < end);
+}
+
+static void smaps_pud_range(pgd_t *pgd, /* walks the pud entries below one pgd entry */
+ unsigned long address,
+ unsigned long size,

+ struct mem_size_stats *mss)


+{
+ pud_t *pud;
+ unsigned long end;
+
+ if (pgd_none(*pgd))
+ return;
+ if (unlikely(pgd_bad(*pgd))) {
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd); /* NOTE(review): clears the bad entry although the walk otherwise only reads */
+ return;
+ }
+ pud = pud_offset(pgd, address);
+ address &= ~PGDIR_MASK; /* keep only the offset inside this pgd entry */
+ end = address + size;
+ if (end > PGDIR_SIZE) /* never walk past the end of this pgd entry */
+ end = PGDIR_SIZE;
+ do {

+ smaps_pmd_range(pud, address, end - address, mss); /* callee clamps the size to one pud */


+ address = (address + PUD_SIZE) & PUD_MASK; /* start of the next pud */
+ pud++;
+ } while (address < end);
+}
+
+static void smaps_pgd_range(pgd_t *pgd, /* top of the walk: pgd is the entry for start_address */
+ unsigned long start_address,
+ unsigned long end_address, /* walk stops once start_address reaches this */

+ struct mem_size_stats *mss)
+{
+ do {
+ smaps_pud_range(pgd, start_address, end_address - start_address, mss); /* callee clamps the size to one pgd entry */


+ start_address = (start_address + PGDIR_SIZE) & PGDIR_MASK; /* start of the next pgd entry's range */
+ pgd++;
+ } while (start_address < end_address);
+}
+
+static int show_smap(struct seq_file *m, void *v) /* .show callback of proc_pid_smaps_op: prints one VMA and its memory totals */
+{
+ struct vm_area_struct *map = v; /* v is the VMA to print */
+ struct file *file = map->vm_file;
+ int flags = map->vm_flags;
+ struct mm_struct *mm = map->vm_mm;

+ unsigned long vma_len = (map->vm_end - map->vm_start); /* in bytes; shifted to KB when printed */
+ struct mem_size_stats mss;
+
+ memset(&mss, 0, sizeof mss); /* all counters start at zero; they stay zero when mm is NULL */
+
+ if (mm) {
+ pgd_t *pgd;
+ spin_lock(&mm->page_table_lock); /* held for the whole page-table walk */
+ pgd = pgd_offset(mm, map->vm_start);
+ smaps_pgd_range(pgd, map->vm_start, map->vm_end, &mss);


+ spin_unlock(&mm->page_table_lock);
+ }
+
+ seq_printf(m, "%08lx-%08lx %c%c%c%c ",
+ map->vm_start,
+ map->vm_end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+ flags & VM_EXEC ? 'x' : '-',
+ flags & VM_MAYSHARE ? 's' : 'p');
+
+ if (map->vm_file) /* a path is printed only when the VMA has a file */
+ seq_path(m, file->f_vfsmnt, file->f_dentry, " \t\n\\");
+

+ seq_printf(m, "\n"
+ "Size: %8lu KB\n"
+ "Rss: %8lu KB\n"
+ "Shared_Clean: %8lu KB\n"
+ "Shared_Dirty: %8lu KB\n"
+ "Private_Clean: %8lu KB\n"
+ "Private_Dirty: %8lu KB\n",
+ vma_len >> 10, /* >> 10 converts bytes to KB */
+ mss.resident >> 10,
+ mss.shared_clean >> 10,
+ mss.shared_dirty >> 10,
+ mss.private_clean >> 10,
+ mss.private_dirty >> 10);


+ return 0; /* always returns 0 */
+}
+
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct task_struct *task = m->private;

@@ -166,3 +338,10 @@ struct seq_operations proc_pid_maps_op =


.stop = m_stop,
.show = show_map
};
+
+struct seq_operations proc_pid_smaps_op = { /* seq_file callbacks for the smaps entry; passed to seq_open() by smaps_open() */
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop, /* same stop callback that proc_pid_maps_op uses */
+ .show = show_smap /* smaps-specific: prints one VMA with its size, rss and shared/private totals */
+};


BR,

Mauricio Lin.

On 1/7/05, Andrew Morton <ak...@osdl.org> wrote:
> Mauricio Lin <mauri...@gmail.com> wrote:
> >

> > Here is a new entry developed for /proc that prints for each process
> > memory area (VMA) the size of rss. The maps from original kernel is
> > able to present the virtual size for each vma, but not the physical
> > size (rss). This entry can provide an additional information for tools
> > that analyze the memory consumption. You can know the physical memory
> > size of each library used by a process and also the executable file.
> >
> > Take a look the output:
> > # cat /proc/877/smaps
> > 08048000-08132000 r-xp /usr/bin/xmms
> > Size: 936 kB
> > Rss: 788 kB
>

> This is potentially quite useful. I'd be interested in what others think of
> the idea and implementation.
>

> > Here is the patch:
>
> - It was wordwrapped. Mail the patch to yourself first, make sure it
> still applies.
>
> - Prepare patches with `diff -u'
>
> -
>
> > + extern struct seq_operations proc_pid_smaps_op;
>
> Put extern headers in .h files, not in .c.
>
>
> > + static void resident_mem_size(struct mm_struct *mm, unsigned long
> > start_address,
> > + unsigned long end_address, unsigned long *size) {
> > + pgd_t *pgd;


> > + pmd_t *pmd;
> > + pte_t *ptep, pte;

> > + unsigned long page;
>
> The identifier `page' is usually used for pointers to struct page. Please
> pick another name?


>
> > + if (pte_present(pte)) {
> > + *size += PAGE_SIZE;
> > + }
>

> We prefer to omit the braces if they enclose only a single statement.
>
> > + if (map->vm_file) {
> > + len = sizeof(void*) * 6 - len;
> > + if (len < 1)
> > + len = 1;
> > + seq_printf(m, "%*c", len, ' ');


> > + seq_path(m, file->f_vfsmnt, file->f_dentry, " \t\n\\");
> > + }
>

> hm, that's a bit bizarre. Isn't there a printf construct which will do the
> right-alignment for you? %8u? (See meminfo_read_proc())

Andrew Morton

unread,
Apr 29, 2005, 9:28:13 PM4/29/05
to Mauricio Lin, linux-...@vger.kernel.org
Mauricio Lin <mauri...@gmail.com> wrote:
>
> I sent some months ago the PATCH about smaps entry. Here is the new
> one with more features included. People that want to perform a memory
> consumption analysing can use it mainly if someone needs to figure out
> which libraries can be reduced for embedded systems. So the new
> features are the physical size of shared and clean [or dirty]; private
> and clean [or dirty]. Do you think this is important for Linux
> community?

Well I like it - a couple of years ago an engineer at Digeo developed
basically the same thing as an aid to working out "where the heck has all
our memory gone" for an embedded system.

Some people will get upset about the general performance and scheduling
latency issues which it will introduce. But whatever - I've added it to
-mm.

Reply all
Reply to author
Forward
0 new messages