counting words in word document works in python but not in django

841 views
Skip to first unread message

agoulou...@gmail.com

unread,
Feb 15, 2017, 7:21:54 AM2/15/17
to Django users
Hello everyone, 

I have been struggling with a function for days. The following function opens a Word document, counts the number of words in it, and returns the count. It works as a standalone Python script, but it just doesn't work in my Django app. Please help me.

import os
import re
import docx

from docx import Document

# Debugging aid: a relative file name such as "essai.docx" is resolved by
# open() against this directory. Neither variable is used except by the
# commented-out print below.
cwd = os.getcwd()  # Get the current working directory (cwd)
files = os.listdir(cwd)  # Get all the files in that directory
# print("Files in '%s': %s" % (cwd, files))

def doc_size(str):
    """Return the number of whitespace-separated words in a .docx file.

    Only the text of the paragraphs exposed by ``document.paragraphs`` is
    counted.

    Args:
        str: Path of the .docx file to read. The name shadows the builtin
            ``str`` inside this function; it is kept so that existing
            keyword callers keep working.

    Returns:
        The word count summed over all paragraphs.
    """
    # The context manager closes the handle even if python-docx raises;
    # the previous version opened the file and never closed it.
    with open(str, "rb") as mon_fichier:
        document = Document(mon_fichier)
        return sum(len(para.text.split()) for para in document.paragraphs)

# Relative path: open() resolves it against the current working directory.
fichier = "essai.docx"
print(doc_size(fichier))

Here starts my django app
my urls

from django.conf.urls import url, include
from django.contrib import admin
from django.conf.urls.static import static
from django.conf import settings
from . import views
from django.views.generic import TemplateView

# URL routes of the app: each entry maps a regex to a view and names the
# route so it can be referenced by name.
urlpatterns = [
    url(r'^$', views.home, name="home"),
    # Rendered straight from the template, without a custom view.
    url(r'^soumettre$',TemplateView.as_view(template_name = 'francais/soumettre.html'), name='soumettre'), 
    # Handled by views.newSubmit; the upload form's action is {% url "saved" %}.
    url(r'^saved$', views.newSubmit, name="saved"),
    url(r'^About$', views.aboutUs, name="about"),
    url(r'^login$',views.formView, name="login"),
    url(r'^loggedin$', views.login, name= "loggedin"),
 

]


Here is my model. It is a model to upload and save a file

class SubmitDoc(models.Model):
    """A document submitted through the upload form, with the sender's details."""

    firstName = models.CharField(max_length=100)
    lastName = models.CharField(max_length=100)
    email = models.EmailField()
    # Uploads are stored under documents/<year>/<month>/<day>/.
    uploadDoc = models.FileField(upload_to="documents/%Y/%m/%d/")
    comment = models.TextField()
    date = models.DateTimeField(auto_now_add=True, verbose_name="Date of creation")

    def obtain_text(self):
        """Return the number of whitespace-separated words in the uploaded .docx.

        Only the text of the paragraphs exposed by ``document.paragraphs``
        is counted.

        Returns:
            The word count summed over all paragraphs.

        Note:
            ``uploadDoc.path`` is only available when the file storage keeps
            files on the local filesystem.
        """
        # open() needs a filesystem path, not the FieldFile object itself,
        # and a .docx is a zip archive, so it must be read in binary mode.
        with open(self.uploadDoc.path, "rb") as data:
            document = Document(data)
            return sum(len(para.text.split()) for para in document.paragraphs)

here is the form that goes with the model



class SubmitDocForm(forms.Form):
    """Upload form: sender's name and email, the document, and a comment."""
    firstName = forms.CharField()
    lastName = forms.CharField()
    email = forms.EmailField()
    uploadDoc = forms.FileField()  
    # NOTE(review): the "like" choice field is commented out, so this form
    # never puts a "like" key into cleaned_data.
#     CHOICES = (("Document Professionel", 'Document Professionel'), ("Document Scolaire", 'Document Scolaire'))
#     like = forms.TypedChoiceField(choices=CHOICES, widget=forms.RadioSelect) 
    comment = forms.CharField(widget = forms.Textarea)


here is the template used to upload the file 


{% extends "base.html" %}

{% block content %}

{# Upload form: posts the sender's details, the document and a comment to the "saved" URL. #}
<div class="w3-container w3-orange">
  <h2>Document Submission Form</h2>
</div>

{# multipart/form-data is required, otherwise the file is not sent with the POST. #}
<form name="form" enctype="multipart/form-data" class="w3-container" action="{% url 'saved' %}" method="POST">
  {% csrf_token %}

  <p>
    <input class="w3-input" type="text" name="firstName" id="firstName" required>
    <label for="firstName">First Name</label>
  </p>

  <p>
    <input class="w3-input" type="text" name="lastName" id="lastName" required>
    <label for="lastName">Last Name</label>
  </p>

  <p>
    <input class="w3-input" type="email" name="email" id="email" required>
    <label for="email">Email</label>
  </p>

  <p>  </p>

  <br>
  <br>

  <div class="w3-container">
    <input type="file" name="uploadDoc" id="uploadDoc" required>
    <label class="w3-label" for="uploadDoc">load File here</label>
  </div>
  <br>
  <br>

  {# The "like" value is posted with the form but is not a field of SubmitDocForm. #}
  <div class="w3-row-padding">
    <div class="w3-third">
      <input class="w3-radio" type="radio" name="like" id="like-scolaire" value="Document Scolaire">
      <label class="w3-validate" for="like-scolaire">Document Scolaire</label>
    </div>
    <div class="w3-third">
      <input class="w3-radio w3-half" type="radio" name="like" id="like-professionnel" value="Document Professionel">
      <label class="w3-validate" for="like-professionnel">Document Professionnel</label>
    </div>
  </div>

  <div class="w3-container">
    <textarea class="w3-input" name="comment" id="comment" required></textarea>
    <label class="w3-label" for="comment">Comments</label>
  </div>

  <p style="text-align:center">
    <button class="w3-btn w3-orange" type="submit">Submit</button>
  </p>

</form>

{% endblock %}


Here is my view


from django.shortcuts import render, render_to_response
from django.shortcuts import HttpResponse, HttpResponseRedirect
from .forms import SubmitDocForm, LoginForm
from .models import Login, SubmitDoc
from django.template import context, RequestContext
from _datetime import date
from datetime import datetime
import os
import re
import docx
from docx import Document
from django.http.response import HttpResponseRedirect
import logging


def newSubmit(request):
    """Save an uploaded document and render the confirmation page.

    The request's POST data and files are bound to ``SubmitDocForm``. When
    the form is valid, a ``SubmitDoc`` is saved and the upload is handed to
    ``doc_size_handle``; its result reaches the template as ``compteur``
    and ``save`` is True. Otherwise nothing is saved, ``save`` stays False
    and an unbound form is passed to the template.

    Args:
        request: The incoming request.

    Returns:
        The rendered ``francais/saved.html`` response.
    """
    save = False
    com = 0
    compteur = ""
    message = "bon"

    form = SubmitDocForm(request.POST, request.FILES)
    if form.is_valid():
        cleaned = form.cleaned_data

        submitDoc = SubmitDoc()
        submitDoc.firstName = cleaned['firstName']
        submitDoc.lastName = cleaned['lastName']
        submitDoc.email = cleaned['email']
        submitDoc.uploadDoc = cleaned['uploadDoc']
        # SubmitDocForm declares no "like" field, so cleaned_data["like"]
        # raised KeyError on every valid submission. Fall back to the raw
        # radio value posted by the template.
        # NOTE(review): SubmitDoc as shown has no "like" model field, so this
        # attribute is not persisted by save() -- confirm whether it should be.
        submitDoc.like = cleaned.get("like", request.POST.get("like", ""))
        submitDoc.comment = cleaned['comment']
        submitDoc.save()

        save = True
        compteur = doc_size_handle(request.FILES["uploadDoc"])
    else:
        form = SubmitDocForm()

    # NOTE(review): "object" below is the Python builtin, kept only so the
    # template context keys stay unchanged.
    context = {
        "form": form,
        "compteur": compteur,
        "save": save,
        "com": com,
        "message": message,
        "object": object,
    }
    return render(request, 'francais/saved.html', context)




def doc_size_handle(f):
    """Return the number of whitespace-separated words in an uploaded .docx.

    Only the text of the paragraphs exposed by ``document.paragraphs`` is
    counted.

    Args:
        f: The uploaded file object (the caller passes
            ``request.FILES["uploadDoc"]``). It is closed before returning.

    Returns:
        The word count summed over all paragraphs.
    """
    # The return value of open() is deliberately not used: on Django
    # releases before 2.0 it is None, and Document(None) silently loads
    # python-docx's built-in blank template instead of the upload.
    f.open("rb")
    try:
        document = Document(f)
        # Return the accumulated count; the previous version returned the
        # text of the last paragraph instead.
        return sum(len(para.text.split()) for para in document.paragraphs)
    finally:
        f.close()


HERE IS THE TEMPLATE MEANT TO DISPLAY THE NUMBER OF WORDS IN THE FILE. However, the "compteur" variable does not get updated. Please help me.



      {# "save" and "compteur" are supplied by the newSubmit view's context. #}
      {% if save %}
     
         <strong>      
  your document has been received. We will review it and get back to you.
 
  <p> current directory  </p>
  {# Word count returned by doc_size_handle() in the view. #}
  <p> size: {{compteur}} </p>
  <p>  </p>
  <p> </p>
 
 
  </strong>
 
      {% endif %}
      
      {# Shown when the form did not validate. #}
      {% if not save %}
         <strong>Your document was not received. Please try again with the appropriate format.</strong>
      {% endif %}

    




home.html
saved.html
soumettre.html

Matthew Pava

unread,
Feb 15, 2017, 9:25:29 AM2/15/17
to django...@googlegroups.com

You may want to consider a different package or python function.

Here is a way to extract only the text from a docx without using python-docx:

http://etienned.github.io/posts/extract-text-from-word-docx-simply/

 

There is also another script that appears to be more accurate by taking headers and footers into account.

https://github.com/ankushshah89/python-docx2txt

 

I am interested in your question because I am interested in examining Word documents through automation as well.  My database used to be in Microsoft Access before I moved it to Django/PostgreSQL, so I’m quite familiar with VBA and modifying Word documents through VBA in the COM interface.

 

I would prefer pywin32 and following this tutorial:

http://new.galalaly.me/2011/09/use-python-to-parse-microsoft-word-documents-using-pywin32-library/

 

And in VBA, I would be able to use this command to get the Word Count:

Word.ActiveDocument.Words.Count

 

It’s just a matter of converting that into Python.

I hope you find this helpful.

--
You received this message because you are subscribed to the Google Groups "Django users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to django-users...@googlegroups.com.
To post to this group, send email to django...@googlegroups.com.
Visit this group at https://groups.google.com/group/django-users.
To view this discussion on the web visit https://groups.google.com/d/msgid/django-users/7576d9c9-f759-40b2-b0c9-a636a9c4f35f%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

agoulou...@gmail.com

unread,
Feb 15, 2017, 5:44:04 PM2/15/17
to Django users, Matthe...@iss.com
Thank you very much. I tried the first version and added the following code to my view at the point where the document is saved, and it works. I am very grateful. Thanks a lot.



pathy = submitDoc.uploadDoc.path
        solution= get_docx_text(pathy)
        counted = len(solution.split())

To post to this group, send email to djang...@googlegroups.com.

Reply all
Reply to author
Forward
0 new messages